From 075a6cfa18c93e7d79ace6f94cc3c911b4d82131 Mon Sep 17 00:00:00 2001 From: guinmoon Date: Sun, 16 Jul 2023 18:47:55 +0300 Subject: [PATCH] + add starcoder inference test with tis model https://huggingface.co/mike-ravkine/gpt_bigcode-santacoder-GGML + update ggml to d2b178ee747cfb0a0a1892da362d068080659170 + update llama to 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f --- LLMFarm.xcodeproj/project.pbxproj | 6 + LLMFarm/AIChatModel.swift | 3 + LLMFarm/Chats/ChatListView.swift | 9 +- LLMFarm/LLMFarmApp.swift | 12 +- LLMFarm/Settings/AddChatView.swift | 22 +- LLMFarm_core/AI.swift | 3 + LLMFarm_core/GPTBase.swift | 2 +- LLMFarm_core/LLaMa.swift | 2 +- LLMFarm_core/Model.swift | 4 +- LLMFarm_core/Starcoder.swift | 36 +- ModelTest/main.swift | 21 +- llmfarm_core.swift/Package.swift | 2 +- .../llmfarm_core/{llama => }/ggml-metal.h | 6 +- .../llmfarm_core/{llama => }/ggml-metal.m | 97 +- .../Sources/llmfarm_core/ggml.c | 4035 ++++++++--------- .../Sources/llmfarm_core/ggml.h | 243 +- .../Sources/llmfarm_core/gpt2/gpt2.cpp | 10 +- .../Sources/llmfarm_core/gptneox/gptneox.cpp | 70 +- .../Sources/llmfarm_core/k_quants.h | 8 + .../Sources/llmfarm_core/llama/llama.cpp | 549 ++- .../Sources/llmfarm_core/replit/replit.cpp | 4 +- .../llmfarm_core/spm-headers/gpt_spm.h | 2 +- .../Sources/llmfarm_core/spm-headers/llama.h | 79 +- .../llmfarm_core/spm-headers/starcoder.h | 44 + .../Sources/llmfarm_core/starcoder/main.cpp | 927 ++++ .../llmfarm_core/starcoder/starcoder-mmap.cpp | 1124 +++++ .../llmfarm_core/starcoder/starcoder.cpp | 1035 +++++ metal/ggml-metal.metal | 239 +- 28 files changed, 5854 insertions(+), 2740 deletions(-) rename llmfarm_core.swift/Sources/llmfarm_core/{llama => }/ggml-metal.h (92%) rename llmfarm_core.swift/Sources/llmfarm_core/{llama => }/ggml-metal.m (93%) create mode 100644 llmfarm_core.swift/Sources/llmfarm_core/spm-headers/starcoder.h create mode 100644 llmfarm_core.swift/Sources/llmfarm_core/starcoder/main.cpp create mode 100644 llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder-mmap.cpp create mode 100644 llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder.cpp diff --git a/LLMFarm.xcodeproj/project.pbxproj b/LLMFarm.xcodeproj/project.pbxproj index de7da39..90f4a1a 100644 --- a/LLMFarm.xcodeproj/project.pbxproj +++ b/LLMFarm.xcodeproj/project.pbxproj @@ -9,6 +9,8 @@ /* Begin PBXBuildFile section */ 150CAD012A3CE7B30015CD66 /* GPTNeox.swift in Sources */ = {isa = PBXBuildFile; fileRef = 150CAD002A3CE7B30015CD66 /* GPTNeox.swift */; }; 150CAD022A3CE7B30015CD66 /* GPTNeox.swift in Sources */ = {isa = PBXBuildFile; fileRef = 150CAD002A3CE7B30015CD66 /* GPTNeox.swift */; }; + 15141E162A6438AE0060E767 /* Starcoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 15141E152A6438AE0060E767 /* Starcoder.swift */; }; + 15141E172A6438AE0060E767 /* Starcoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 15141E152A6438AE0060E767 /* Starcoder.swift */; }; 155FBB9F2A460566004DD5AE /* GPTBase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 155FBB9E2A460566004DD5AE /* GPTBase.swift */; }; 155FBBA02A460566004DD5AE /* GPTBase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 155FBB9E2A460566004DD5AE /* GPTBase.swift */; }; 155FBBA22A48B5C1004DD5AE /* Replit.swift in Sources */ = {isa = PBXBuildFile; fileRef = 155FBBA12A48B5C0004DD5AE /* Replit.swift */; }; @@ -84,6 +86,7 @@ /* Begin PBXFileReference section */ 150CAD002A3CE7B30015CD66 /* GPTNeox.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GPTNeox.swift; sourceTree = ""; }; + 15141E152A6438AE0060E767 /* Starcoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Starcoder.swift; sourceTree = ""; }; 155FBB9E2A460566004DD5AE /* GPTBase.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GPTBase.swift; sourceTree = ""; }; 155FBBA12A48B5C0004DD5AE /* Replit.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Replit.swift; sourceTree = ""; }; 1560D6162A2D1A3D00918330 /* AddChatView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AddChatView.swift; sourceTree = ""; }; @@ -221,6 +224,7 @@ 1560D61B2A2E11E200918330 /* ExceptionCatcher.h */, 15F210552A1919390021F414 /* AI.swift */, 15F210512A1919390021F414 /* ArrayExt.swift */, + 15141E152A6438AE0060E767 /* Starcoder.swift */, 15F210542A1919390021F414 /* GPTNeoX_old.swift */, 150CAD002A3CE7B30015CD66 /* GPTNeox.swift */, 155FBB9E2A460566004DD5AE /* GPTBase.swift */, @@ -427,6 +431,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 15141E162A6438AE0060E767 /* Starcoder.swift in Sources */, 15F2102C2A190D5B0021F414 /* Math.swift in Sources */, 155FBBA22A48B5C1004DD5AE /* Replit.swift in Sources */, 15F210642A19193A0021F414 /* AI.swift in Sources */, @@ -469,6 +474,7 @@ 15F210A42A1921E40021F414 /* Utils.swift in Sources */, 15A480002A40D2A300B72DC3 /* GPT2.swift in Sources */, 15F210A52A1921E40021F414 /* ComputeGraph.swift in Sources */, + 15141E172A6438AE0060E767 /* Starcoder.swift in Sources */, 1560D6172A2D1A3D00918330 /* AddChatView.swift in Sources */, 15F210962A1920AA0021F414 /* AIChatModel.swift in Sources */, 15F210AA2A1A659B0021F414 /* FileHelper.swift in Sources */, diff --git a/LLMFarm/AIChatModel.swift b/LLMFarm/AIChatModel.swift index d48b05d..d556418 100644 --- a/LLMFarm/AIChatModel.swift +++ b/LLMFarm/AIChatModel.swift @@ -97,6 +97,9 @@ final class AIChatModel: ObservableObject { }else if chat_config!["model_inference"] as! String == "replit" { try? self.chat?.loadModel(ModelInference.Replit,contextParams: model_context_param) self.chat?.model.stop_words.append("<|endoftext|>") + }else if chat_config!["model_inference"] as! String == "starcoder" { + try? self.chat?.loadModel(ModelInference.Starcoder,contextParams: model_context_param) + self.chat?.model.stop_words.append("<|endoftext|>") } } else{ diff --git a/LLMFarm/Chats/ChatListView.swift b/LLMFarm/Chats/ChatListView.swift index 91ef27d..88e6de3 100644 --- a/LLMFarm/Chats/ChatListView.swift +++ b/LLMFarm/Chats/ChatListView.swift @@ -19,9 +19,13 @@ struct ChatListView: View { var close_chat: () -> Void @Binding var edit_chat_dialog: Bool @Binding var chat_selection: String? - + @Binding var renew_chat_list: () -> Void @State var chats_previews = get_chat_list()! + func refresh_chat_list(){ + self.chats_previews = get_chat_list()! + } + func delete(at offsets: IndexSet) { let chatsToDelete = offsets.map { self.chats_previews[$0] } let res = delete_chats(chatsToDelete) @@ -105,6 +109,9 @@ struct ChatListView: View { #endif } .background(.opacity(0)) + }.task { + renew_chat_list = refresh_chat_list + refresh_chat_list() } } diff --git a/LLMFarm/LLMFarmApp.swift b/LLMFarm/LLMFarmApp.swift index 5d893a5..c6e241a 100644 --- a/LLMFarm/LLMFarmApp.swift +++ b/LLMFarm/LLMFarmApp.swift @@ -20,6 +20,7 @@ struct LLMFarmApp: App { @StateObject var orientationInfo = OrientationInfo() @State var isLandscape:Bool = false @State private var chat_selection: String? + @State var renew_chat_list: () -> Void = {} func close_chat() -> Void{ aiChatModel.stop_predict() @@ -36,12 +37,14 @@ struct LLMFarmApp: App { add_chat_dialog:$add_chat_dialog, close_chat:close_chat, edit_chat_dialog:$edit_chat_dialog, - chat_selection:$chat_selection) + chat_selection:$chat_selection, + renew_chat_list: $renew_chat_list) .disabled(edit_chat_dialog) .frame(minWidth: 250, maxHeight: .infinity) }else{ AddChatView(add_chat_dialog: $add_chat_dialog, - edit_chat_dialog:.constant(false)) + edit_chat_dialog: $edit_chat_dialog, + renew_chat_list: $renew_chat_list) .frame(minWidth: 200,maxHeight: .infinity) } } @@ -58,8 +61,9 @@ struct LLMFarmApp: App { } else{ AddChatView(add_chat_dialog: $add_chat_dialog, - edit_chat_dialog:$edit_chat_dialog, - chat_name: aiChatModel.chat_name) + edit_chat_dialog: $edit_chat_dialog, + chat_name: aiChatModel.chat_name, + renew_chat_list: $renew_chat_list) .frame(minWidth: 200,maxHeight: .infinity) #if !os(macOS) .toolbar(.hidden, for: .automatic) diff --git a/LLMFarm/Settings/AddChatView.swift b/LLMFarm/Settings/AddChatView.swift index 6cc7920..c83be96 100644 --- a/LLMFarm/Settings/AddChatView.swift +++ b/LLMFarm/Settings/AddChatView.swift @@ -72,25 +72,29 @@ struct AddChatView: View { @State private var numberOfThreads: Int32 = 0 @State private var use_metal: Bool = false @State private var isImporting: Bool = false + @Binding var renew_chat_list: () -> Void private var chat_name: String = "" let bin_type = UTType(tag: "bin", tagClass: .filenameExtension, conformingTo: nil) @State private var model_inference = "auto" - let model_inferences = ["auto","gptneox", "llama", "gpt2", "replit"] + let model_inferences = ["auto","gptneox", "llama", "gpt2", "replit", "starcoder"] @State private var model_icon: String = "ava0" let model_icons = ["ava0","ava1","ava2","ava3","ava4","ava5","ava6","ava7"] - init(add_chat_dialog: Binding,edit_chat_dialog:Binding) { + init(add_chat_dialog: Binding,edit_chat_dialog:Binding, + renew_chat_list: Binding<() -> Void>) { self._add_chat_dialog = add_chat_dialog self._edit_chat_dialog = edit_chat_dialog + self._renew_chat_list = renew_chat_list } - init(add_chat_dialog: Binding,edit_chat_dialog:Binding,chat_name:String - ) { + init(add_chat_dialog: Binding,edit_chat_dialog:Binding, + chat_name:String,renew_chat_list: Binding<() -> Void>) { self._add_chat_dialog = add_chat_dialog self._edit_chat_dialog = edit_chat_dialog + self._renew_chat_list = renew_chat_list self.chat_name = chat_name let chat_config = get_chat_info(chat_name) if (chat_config!["title"] != nil){ @@ -180,8 +184,14 @@ struct AddChatView: View { "numberOfThreads":Int32(numberOfThreads), "icon":model_icon] let res = create_chat(options,edit_chat_dialog:self.edit_chat_dialog,chat_name:self.chat_name) - add_chat_dialog = false - edit_chat_dialog = false + if add_chat_dialog { + add_chat_dialog = false + + } + if edit_chat_dialog { + edit_chat_dialog = false + } + renew_chat_list() } } label: { Text(edit_chat_dialog ? "Save" :"Add" ) diff --git a/LLMFarm_core/AI.swift b/LLMFarm_core/AI.swift index 009219b..baba19c 100644 --- a/LLMFarm_core/AI.swift +++ b/LLMFarm_core/AI.swift @@ -11,6 +11,7 @@ enum ModelInference { case GPTNeoxInference case GPT2 case Replit + case Starcoder } class AI { @@ -43,6 +44,8 @@ class AI { model = try? GPT2(path: self.modelPath, contextParams: contextParams) case .Replit: model = try? Replit(path: self.modelPath, contextParams: contextParams) + case .Starcoder: + model = try? Starcoder(path: self.modelPath, contextParams: contextParams) } } diff --git a/LLMFarm_core/GPTBase.swift b/LLMFarm_core/GPTBase.swift index 1230706..bda5595 100644 --- a/LLMFarm_core/GPTBase.swift +++ b/LLMFarm_core/GPTBase.swift @@ -22,7 +22,7 @@ public class GPTBase: Model { var params = gpt_context_default_params() params.n_ctx = contextParams.context params.n_parts = contextParams.parts - params.seed = contextParams.seed + params.seed = 0 params.f16_kv = contextParams.f16Kv params.logits_all = contextParams.logitsAll params.vocab_only = contextParams.vocabOnly diff --git a/LLMFarm_core/LLaMa.swift b/LLMFarm_core/LLaMa.swift index 894caa6..91bf535 100644 --- a/LLMFarm_core/LLaMa.swift +++ b/LLMFarm_core/LLaMa.swift @@ -39,7 +39,7 @@ public class LLaMa: GPTBase { var params = llama_context_default_params() params.n_ctx = contextParams.context // params.n_parts = contextParams.parts - params.seed = contextParams.seed + params.seed = UInt32(contextParams.seed) params.f16_kv = contextParams.f16Kv params.logits_all = contextParams.logitsAll params.vocab_only = contextParams.vocabOnly diff --git a/LLMFarm_core/Model.swift b/LLMFarm_core/Model.swift index a3e35bf..ad9a8b1 100644 --- a/LLMFarm_core/Model.swift +++ b/LLMFarm_core/Model.swift @@ -72,7 +72,7 @@ public func get_model_context_param_by_config(_ model_config:Dictionary Bool{ + self.context = starcoder_init_from_file(path, params) + self.promptFormat = .None + return true + } + + deinit { + gpt2_free(context) + } + + public override func gpt_eval(inputBatch:[ModelToken]) throws -> Bool{ + if starcoder_eval(context, inputBatch, Int32(inputBatch.count), nPast, contextParams.numberOfThreads) != 0 { + throw ModelError.failedToEval + } + return true + } + + public override func gpt_init_logits() throws -> Bool { + if starcoder_init_logits(context, contextParams.numberOfThreads) != 0 { + throw ModelError.failedToEval + } + return true + } +} + + diff --git a/ModelTest/main.swift b/ModelTest/main.swift index ae3a84b..9b1e233 100644 --- a/ModelTest/main.swift +++ b/ModelTest/main.swift @@ -37,6 +37,7 @@ func prompt_for_generation(_ instruction:String) -> String{ func main(){ print("Hello.") + var input_text = "State the meaning of life." // let ai = AI(_modelPath: "/Users/guinmoon/Library/Containers/com.guinmoon.LLMFarm/Data/Documents/models/dolly-v2-3b-q5_1.bin",_chatName: "chat") // try? ai.loadModel(ModelInference.GPTNeoxInference) // ai.model.promptFormat = .Dolly_b3 @@ -44,25 +45,27 @@ func main(){ // let ai = AI(_modelPath: "/Users/guinmoon/Library/Containers/com.guinmoon.LLMFarm/Data/Documents/models/AI-Dungeon-2-Classic.bin",_chatName: "chat") // try? ai.loadModel(ModelInference.GPT2) // ai.model.promptFormat = .None - +// // let ai = AI(_modelPath: "/Users/guinmoon/dev/alpaca_llama_etc/replit-code-v1-3b-ggml-q5_1.bin",_chatName: "chat") // try? ai.loadModel(ModelInference.Replit) // ai.model.promptFormat = .None -// #define MacMetal // let ai = AI(_modelPath: "/Users/guinmoon/dev/alpaca_llama_etc/orca-mini-3b.ggmlv3.q4_0.bin",_chatName: "chat") - let ai = AI(_modelPath: "/Users/guinmoon/dev/alpaca_llama_etc/orca-mini-7b.ggmlv3.q3_K_M.bin",_chatName: "chat") - var params:ModelContextParams = .default - params.use_metal = true - try? ai.loadModel(ModelInference.LLamaInference,contextParams: params) - ai.model.promptFormat = .LLaMa - +// let ai = AI(_modelPath: "/Users/guinmoon/dev/alpaca_llama_etc/orca-mini-7b.ggmlv3.q3_K_M.bin",_chatName: "chat") +// var params:ModelContextParams = .default +// params.use_metal = true +// try? ai.loadModel(ModelInference.LLamaInference,contextParams: params) +// ai.model.promptFormat = .LLaMa + let ai = AI(_modelPath: "/Users/guinmoon/dev/alpaca_llama_etc/santacoder-q8_0.bin",_chatName: "chat") + try? ai.loadModel(ModelInference.Starcoder) + ai.model.promptFormat = .None + input_text = "def quicksort" ai.model.contextParams.seed = 0; // ai.model.promptStyle = .StableLM_Tuned - let input_text = "State the meaning of life." + // let input_text = "Tell about Stavropol." // let prompt = prompt_for_generation(input_text) let prompt = input_text diff --git a/llmfarm_core.swift/Package.swift b/llmfarm_core.swift/Package.swift index 9910d56..c429923 100644 --- a/llmfarm_core.swift/Package.swift +++ b/llmfarm_core.swift/Package.swift @@ -18,7 +18,7 @@ let package = Package( targets: [ .target( name: "llmfarm_core", - sources: ["ggml.c","llama/ggml-metal.m","k_quants.c", "gptneox/gptneox.cpp","gpt2/gpt2.cpp","replit/replit.cpp","common.cpp","gpt_helpers.cpp","gpt_spm.cpp", "llama/llama.cpp"], + sources: ["ggml.c","ggml-metal.m","k_quants.c", "gptneox/gptneox.cpp","gpt2/gpt2.cpp","replit/replit.cpp","starcoder/starcoder.cpp","common.cpp","gpt_helpers.cpp","gpt_spm.cpp", "llama/llama.cpp"], publicHeadersPath: "spm-headers", // I'm not sure about some of the flags, please correct it's wrong. cSettings: [ diff --git a/llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.h b/llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.h similarity index 92% rename from llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.h rename to llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.h index b9e50ac..928f170 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.h +++ b/llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.m b/llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.m similarity index 93% rename from llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.m rename to llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.m index 1fdd8a5..91780f9 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/llama/ggml-metal.m +++ b/llmfarm_core.swift/Sources/llmfarm_core/ggml-metal.m @@ -1,6 +1,6 @@ #import "ggml-metal.h" -#import "../ggml.h" +#import "ggml.h" #import @@ -25,6 +25,8 @@ }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,22 +88,14 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -@interface llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ : NSObject -@end - -@implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ -@end - -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: allocating\n", __func__); -// struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + //struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = calloc(1, sizeof(struct ggml_metal_context)); - -// sleep(5); + + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); -// ctx->device = MTLCopyAllDevices().firstObject; -// sleep(5); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -126,15 +120,13 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ } #else UNUSED(msl_library_source); + // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource { NSError * error = nil; + #ifndef CompiledMetal NSString *metal_path = @"/Users/guinmoon/dev/alpaca_llama_etc/LLMFarm/metal/ggml-metal.metal"; -//#else -// NSString *metal_path = [NSBundle.mainBundle.resourcePath stringByAppendingString:@"/ggml-metal.mtl"]; -//#endif -// NSString *metal_path = @"/Users/guinmoon/Library/Developer/Xcode/DerivedData/LLMFarm-bfxpdlswbhfozgepczdsdayclcnh/Build/Products/Debug/llmfarm_core.swift_llmfarm_core.bundle/Contents/Resources/Resources/ggml-metal.metal"; fprintf(stderr, "%s: loading '%s'\n", __func__, [metal_path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:metal_path encoding:NSUTF8StringEncoding error:&error]; if (error) { @@ -142,6 +134,7 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ exit(1); } #endif + #ifdef GGML_QKK_64 MTLCompileOptions* options = [MTLCompileOptions new]; options.preprocessorMacros = @{ @"QK_K" : @(64) }; @@ -166,7 +159,7 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ - fprintf(stderr, "%s: loaded %-32s \n", __func__, "kernel_"#name); + fprintf(stderr, "%s: loaded %-32s\n", __func__, "kernel_"#name); GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(mul); @@ -181,10 +174,6 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); GGML_METAL_ADD_KERNEL(get_rows_q2_K); -// ctx->function_get_rows_q4_1 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_1"]; -// ctx->pipeline_get_rows_q4_1 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_1 error:nil]; -// ctx->function_get_rows_q2_K = [ctx->library newFunctionWithName:@"kernel_get_rows_q2_K"]; -// ctx->pipeline_get_rows_q2_K = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q2_K error:nil]; GGML_METAL_ADD_KERNEL(get_rows_q3_K); GGML_METAL_ADD_KERNEL(get_rows_q4_K); GGML_METAL_ADD_KERNEL(get_rows_q5_K); @@ -207,7 +196,6 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ #undef GGML_METAL_ADD_KERNEL } - #ifndef TARGET_OS_IPHONE fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); @@ -223,11 +211,15 @@ @implementation llmfarm_core_swift_llmfarm_core_SWIFTPM_MODULE_BUNDLER_FINDER_ void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); // for (int i = 0; i < ctx->n_buffers; ++i) { -// [ctx->buffers[i].metal release]; +// [ctx->buffers[i].metal release]; // } free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -329,7 +321,6 @@ bool ggml_metal_add_buffer( ++ctx->n_buffers; } } - #ifndef TARGET_OS_IPHONE fprintf(stderr, ", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, @@ -342,6 +333,7 @@ bool ggml_metal_add_buffer( } #endif } + return true; } @@ -375,7 +367,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; @@ -407,8 +399,8 @@ void ggml_metal_graph_compute( for (int i = node_start; i < node_end; ++i) { metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * dst = gf->nodes[i]; const int64_t ne00 = src0 ? src0->ne[0] : 0; @@ -464,6 +456,7 @@ void ggml_metal_graph_compute( //} switch (dst->op) { + case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: @@ -753,8 +746,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q3_K || @@ -895,28 +887,35 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *)(src1->data))[0]; + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/llmfarm_core.swift/Sources/llmfarm_core/ggml.c b/llmfarm_core.swift/Sources/llmfarm_core/ggml.c index 684caaa..5ce1da0 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/ggml.c +++ b/llmfarm_core.swift/Sources/llmfarm_core/ggml.c @@ -25,16 +25,23 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include #endif +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#endif #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts @@ -49,23 +56,23 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; -static void atomic_store(atomic_int* ptr, LONG val) { +static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } -static LONG atomic_load(atomic_int* ptr) { +static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) @@ -77,7 +84,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void return 0; } -static int pthread_join(pthread_t thread, void* unused) { +static int pthread_join(pthread_t thread, void * unused) { (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -90,7 +97,7 @@ static int sched_yield (void) { #include #include -typedef void* thread_ret_t; +typedef void * thread_ret_t; #include #include @@ -111,10 +118,6 @@ typedef void* thread_ret_t; #endif #endif -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -220,16 +223,38 @@ inline static void* ggml_aligned_malloc(size_t size) { #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif -#define UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) +// +// tensor access macros +// + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + #if defined(GGML_USE_ACCELERATE) #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include +#endif #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -463,14 +488,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } -void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { - for (size_t i = 0; i < n; i++) { +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { - size_t i = 0; +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { __m256 x_vec = _mm256_loadu_ps(x + i); @@ -1609,109 +1634,112 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in } } +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + }, + [GGML_TYPE_F16] = { + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, + }, [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q4_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q5_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, - .quantize_row_q = quantize_row_q5_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q5_0_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, - .quantize_row_q = quantize_row_q5_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q5_1_q8_1, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q8_0] = { - .dequantize_row_q = dequantize_row_q8_0, - .quantize_row_q = quantize_row_q8_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q8_0_q8_0, + .to_float = dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q8_1] = { - .dequantize_row_q = NULL, // TODO - .quantize_row_q = quantize_row_q8_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = NULL, // TODO + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, #ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, - .quantize_row_q = quantize_row_q2_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q2_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, - .quantize_row_q = quantize_row_q3_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q3_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, - .quantize_row_q = quantize_row_q4_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q4_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, - .quantize_row_q = quantize_row_q5_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q5_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, - .quantize_row_q = quantize_row_q6_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = ggml_vec_dot_q6_K_q8_K, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_Q8_K] = { + .from_float = quantize_row_q8_K, + } #endif }; // For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) { GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; + return type_traits[i]; } @@ -2257,7 +2285,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -2294,7 +2322,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float *s = sumf; } -inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -3447,6 +3475,8 @@ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } static const float GELU_COEF_A = 0.044715f; @@ -3598,6 +3628,16 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x *s = 1.f/(*s); } +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { + float max = -INFINITY; + int idx = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + if (max == x[i]) { idx = i; } + } + *s = idx; +} + // // data types // @@ -3707,12 +3747,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM", "SUM_ROWS", "MEAN", + "ARGMAX", "REPEAT", "REPEAT_BACK", "ABS", "SGN", "NEG", "STEP", + "TANH", + "ELU", "RELU", "GELU", "GELU_QUICK", @@ -3744,9 +3787,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D_S1_PH", - "CONV_1D_S2_PH", - "CONV_2D_SK_P0", + "CONV_1D", + "CONV_2D", + "POOL_1D", + "POOL_2D", "FLASH_ATTN", "FLASH_FF", @@ -3765,7 +3809,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3783,12 +3827,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx", "Σx_k", "Σx/n", + "argmax(x)", "repeat(x)", "repeat_back(x)", "abs(x)", "sgn(x)", "-x", "step(x)", + "tanh(x)", + "elu(x)", "relu(x)", "gelu(x)", "gelu_quick(x)", @@ -3820,9 +3867,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d_s1_ph(x)", - "conv_1d_s2_ph(x)", - "conv_2d_sk_p0(x)", + "conv_1d(x)", + "conv_2d(x)", + "pool_1d(x)", + "pool_2d(x)", "flash_attn(x)", "flash_ff(x)", @@ -3841,11 +3889,47 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); + +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// WARN: +// Mis-confguration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; + +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + // // ggml context // @@ -4086,10 +4170,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { @@ -4267,6 +4350,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_setup_op_has_task_pass(); + is_first_call = false; } @@ -4507,17 +4592,14 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.op =*/ GGML_OP_NONE, /*.is_param =*/ false, /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.src =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -4649,7 +4731,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4701,7 +4783,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4936,8 +5018,8 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -4961,11 +5043,15 @@ struct ggml_tensor * ggml_add_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; - if (a->grad || b->grad) { + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } @@ -4973,8 +5059,8 @@ struct ggml_tensor * ggml_add_impl( result->op = GGML_OP_ADD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5013,8 +5099,8 @@ struct ggml_tensor * ggml_add1_impl( result->op = GGML_OP_ADD1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5071,9 +5157,9 @@ struct ggml_tensor * ggml_acc_impl( result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -5119,8 +5205,8 @@ struct ggml_tensor * ggml_sub_impl( result->op = GGML_OP_SUB; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5166,8 +5252,8 @@ struct ggml_tensor * ggml_mul_impl( result->op = GGML_OP_MUL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5209,8 +5295,8 @@ struct ggml_tensor * ggml_div_impl( result->op = GGML_OP_DIV; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5245,8 +5331,8 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5279,8 +5365,8 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5314,8 +5400,8 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5347,8 +5433,8 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5374,8 +5460,8 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5397,8 +5483,32 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +// ggml_argmax + +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); + is_node = true; + } + + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5425,8 +5535,8 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5453,8 +5563,8 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5475,8 +5585,8 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5510,8 +5620,8 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5544,8 +5654,8 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5578,8 +5688,8 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5596,6 +5706,74 @@ struct ggml_tensor * ggml_step_inplace( return ggml_step_impl(ctx, a, true); } +// ggml_tanh + +struct ggml_tensor * ggml_tanh_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_TANH; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_tanh_impl(ctx, a, true); +} + +// ggml_elu + +struct ggml_tensor * ggml_elu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ELU; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_elu_impl(ctx, a, true); +} + // ggml_relu struct ggml_tensor * ggml_relu_impl( @@ -5612,8 +5790,8 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5646,8 +5824,8 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5680,8 +5858,8 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5714,8 +5892,8 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5749,8 +5927,8 @@ struct ggml_tensor * ggml_silu_back( result->op = GGML_OP_SILU_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5772,8 +5950,8 @@ struct ggml_tensor * ggml_norm_impl( result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5804,8 +5982,8 @@ struct ggml_tensor * ggml_rms_norm_impl( result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5837,8 +6015,8 @@ struct ggml_tensor * ggml_rms_norm_back( result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5859,13 +6037,13 @@ struct ggml_tensor * ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5890,8 +6068,8 @@ struct ggml_tensor * ggml_out_prod( result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5916,8 +6094,8 @@ struct ggml_tensor * ggml_scale_impl( result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5972,9 +6150,9 @@ struct ggml_tensor * ggml_set_impl( result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6061,8 +6239,8 @@ struct ggml_tensor * ggml_cpy_impl( result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6098,8 +6276,8 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6142,8 +6320,8 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6167,8 +6345,8 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6193,8 +6371,8 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6220,8 +6398,8 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6249,8 +6427,8 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6282,9 +6460,9 @@ struct ggml_tensor * ggml_view_1d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6324,9 +6502,9 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6368,9 +6546,9 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6414,9 +6592,9 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6476,8 +6654,8 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; if (is_node) { ggml_scratch_save(ctx); @@ -6491,7 +6669,7 @@ struct ggml_tensor * ggml_permute( ggml_scratch_load(ctx); - result->opt[0] = b; + result->src[2] = b; } return result; @@ -6519,8 +6697,8 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6545,8 +6723,8 @@ struct ggml_tensor * ggml_get_rows( result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6573,9 +6751,9 @@ struct ggml_tensor * ggml_get_rows_back( result->op = GGML_OP_GET_ROWS_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6597,8 +6775,8 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6630,8 +6808,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6678,8 +6856,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6714,8 +6892,8 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6750,8 +6928,8 @@ struct ggml_tensor * ggml_soft_max_back_impl( result->op = GGML_OP_SOFT_MAX_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6778,6 +6956,8 @@ struct ggml_tensor * ggml_rope_impl( int n_past, int n_dims, int mode, + float freq_base, + float freq_scale, int n_ctx, bool inplace) { GGML_ASSERT(n_past >= 0); @@ -6791,19 +6971,21 @@ struct ggml_tensor * ggml_rope_impl( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[3] = n_ctx; + memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); + memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); ggml_scratch_load(ctx); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6815,7 +6997,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6825,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true); } // ggml_rope_back @@ -6837,6 +7031,8 @@ struct ggml_tensor * ggml_rope_back( int n_dims, int mode) { GGML_ASSERT(n_past >= 0); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); + bool is_node = false; if (a->grad) { @@ -6858,8 +7054,8 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6897,8 +7093,8 @@ struct ggml_tensor * ggml_alibi( result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6931,21 +7127,27 @@ struct ggml_tensor * ggml_clamp( result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } -// ggml_conv_1d_s1_ph +// ggml_conv_1d + +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; +} -struct ggml_tensor * ggml_conv_1d_s1_ph( +GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + int s0, + int p0, + int d0) { GGML_ASSERT(ggml_is_matrix(b)); GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); bool is_node = false; if (a->grad || b->grad) { @@ -6953,68 +7155,175 @@ struct ggml_tensor * ggml_conv_1d_s1_ph( is_node = true; } - const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + a->ne[2], 1, 1, + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = p0; + ((int32_t*)c->data)[2] = d0; + ggml_scratch_load(ctx); + + result->op = GGML_OP_CONV_1D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +// ggml_conv_2d + +struct ggml_tensor* ggml_conv_2d( + struct ggml_context* ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + GGML_ASSERT(a->ne[2] == b->ne[2]); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), + a->ne[3], b->ne[3], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); + ((int32_t*)c->data)[0] = s0; + ((int32_t*)c->data)[1] = s1; + ((int32_t*)c->data)[2] = p0; + ((int32_t*)c->data)[3] = p1; + ((int32_t*)c->data)[4] = d0; + ((int32_t*)c->data)[5] = d1; + ggml_scratch_load(ctx); - result->op = GGML_OP_CONV_1D_S1_PH; + result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; + } -// ggml_conv_1d_s2_ph +// ggml_conv_1d_ph -struct ggml_tensor * ggml_conv_1d_s2_ph( +struct ggml_tensor* ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + + +// ggml_pool_* + +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_pool_2d + +struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { + bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { GGML_ASSERT(false); // TODO: implement backward is_node = true; } - const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = s0; + ((int32_t*)c->data)[3] = p0; + ggml_scratch_load(ctx); - result->op = GGML_OP_CONV_1D_S2_PH; + result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = c; return result; } -// ggml_conv_2d_sk_p0 +// ggml_pool_2d -struct ggml_tensor * ggml_conv_2d_sk_p0( +struct ggml_tensor* ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(b->ne[3] == 1); - GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(b->ne[0] % a->ne[0] == 0); - GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { + bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { GGML_ASSERT(false); // TODO: implement backward is_node = true; } - const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - result->op = GGML_OP_CONV_2D_SK_P0; + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = k1; + ((int32_t*)c->data)[3] = s0; + ((int32_t*)c->data)[4] = s1; + ((int32_t*)c->data)[5] = p0; + ((int32_t*)c->data)[6] = p1; + ggml_scratch_load(ctx); + + result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = c; return result; } @@ -7041,10 +7350,10 @@ struct ggml_tensor * ggml_flash_attn( result->op = GGML_OP_FLASH_ATTN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7072,11 +7381,11 @@ struct ggml_tensor * ggml_flash_ff( result->op = GGML_OP_FLASH_FF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; return result; } @@ -7136,11 +7445,11 @@ struct ggml_tensor * ggml_flash_attn_back( result->op = GGML_OP_FLASH_ATTN_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = d; - result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7185,9 +7494,9 @@ struct ggml_tensor * ggml_win_part( result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7222,9 +7531,9 @@ struct ggml_tensor * ggml_win_unpart( result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7253,8 +7562,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32( result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7300,9 +7609,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32( result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7347,8 +7656,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7392,9 +7701,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7441,10 +7750,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - result->opt[1] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; + result->src[3] = c; return result; } @@ -7484,8 +7793,8 @@ struct ggml_tensor * ggml_cross_entropy_loss( result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7504,9 +7813,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->grad = NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -7566,25 +7875,7 @@ static void ggml_compute_forward_dup_f16( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7657,8 +7948,8 @@ static void ggml_compute_forward_dup_f16( id += ne00 * (ne01 - ir1); } } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; @@ -7855,25 +8146,7 @@ static void ggml_compute_forward_dup_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -7928,26 +8201,8 @@ static void ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); @@ -8161,7 +8416,7 @@ static void ggml_compute_forward_add_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8171,24 +8426,8 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8202,23 +8441,23 @@ static void ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); #ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_add_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -8226,15 +8465,20 @@ static void ggml_compute_forward_add_f32( } else { // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -8257,28 +8501,12 @@ static void ggml_compute_forward_add_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8327,24 +8555,8 @@ static void ggml_compute_forward_add_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -8394,32 +8606,15 @@ static void ggml_compute_forward_add_q_f32( } const int nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8533,19 +8728,8 @@ static void ggml_compute_forward_add1_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -8599,23 +8783,12 @@ static void ggml_compute_forward_add1_f16_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8660,23 +8833,12 @@ static void ggml_compute_forward_add1_f16_f16( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); @@ -8721,23 +8883,12 @@ static void ggml_compute_forward_add1_q_f32( const int nth = params->nth; const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8865,15 +9016,8 @@ static void ggml_compute_forward_acc_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during acc const size_t nb0 = ggml_element_size(src0); @@ -8962,24 +9106,8 @@ static void ggml_compute_forward_sub_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9069,29 +9197,7 @@ static void ggml_compute_forward_mul_f32( const int64_t nr = ggml_nrows(src0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9179,24 +9285,8 @@ static void ggml_compute_forward_div_f32( } const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9403,14 +9493,8 @@ static void ggml_compute_forward_sum_f32( assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); ggml_float sum = 0; ggml_float row_sum = 0; @@ -9459,29 +9543,13 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne1 == ne01); GGML_ASSERT(ne2 == ne02); GGML_ASSERT(ne3 == ne03); - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { @@ -9525,19 +9593,7 @@ static void ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; assert(ne0 == 1); assert(ne1 == ne01); @@ -9549,10 +9605,6 @@ static void ggml_compute_forward_mean_f32( UNUSED(ne2); UNUSED(ne3); - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { @@ -9582,38 +9634,66 @@ static void ggml_compute_forward_mean( } } -// ggml_compute_forward_repeat +// ggml_compute_forward_argmax -static void ggml_compute_forward_repeat_f32( +static void ggml_compute_forward_argmax_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_can_repeat(src0, dst)); + assert(params->ith == 0); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + const size_t nb0 = dst->nb[0]; + + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; + } +} + +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_argmax_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_repeat + +static void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -9674,25 +9754,7 @@ static void ggml_compute_forward_repeat_back_f32( return; } - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -9922,6 +9984,90 @@ static void ggml_compute_forward_step( } } +// ggml_compute_forward_tanh + +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_tanh_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_tanh_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_elu + +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_elu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_elu_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_relu static void ggml_compute_forward_relu_f32( @@ -10223,18 +10369,7 @@ static void ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-5f; // TODO: make this a parameter @@ -10300,18 +10435,7 @@ static void ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10376,22 +10500,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const float eps = 1e-6f; // TODO: make this a parameter @@ -10548,409 +10657,37 @@ static void ggml_compute_forward_rms_norm_back( } } - // ggml_compute_forward_mul_mat #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - -static void ggml_compute_forward_mul_mat_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif - const int64_t ne11 = src1->ne[1]; -#ifndef NDEBUG - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; -#endif - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - -#ifndef NDEBUG - const int nb10 = src1->nb[0]; -#endif - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - assert(ne02 == ne12); - assert(ne03 == ne13); - assert(ne2 == ne12); - assert(ne3 == ne13); - - // we don't support permuted src0 or src1 - assert(nb00 == sizeof(float)); - assert(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - assert(nb0 == sizeof(float)); - assert(nb0 <= nb1); - assert(nb1 <= nb2); - assert(nb2 <= nb3); - - assert(ne0 == ne01); - assert(ne1 == ne11); - assert(ne2 == ne02); - assert(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using ggml_vec_dot_f32 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - for (int64_t ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; - - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; - - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - - //int64_t t1 = ggml_perf_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - ggml_fp16_t * const wdata = params->wdata; - - size_t id = 0; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } - } - } - } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); - - // parallelize by src0 rows using ggml_vec_dot_f16 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int64_t ne10 = src1->ne[0]; - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - for (int64_t ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; } - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} + return false; } +#endif -static void ggml_compute_forward_mul_mat_q_f32( +static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10958,51 +10695,26 @@ static void ggml_compute_forward_mul_mat_q_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; - enum ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -11011,16 +10723,16 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } @@ -11030,6 +10742,11 @@ static void ggml_compute_forward_mul_mat_q_f32( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith != 0) { return; } @@ -11042,27 +10759,27 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i03*nb03 + i02*nb02; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); + to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } assert(id*sizeof(float) <= params->wsize); + x = wdata; } - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -11078,14 +10795,16 @@ static void ggml_compute_forward_mul_mat_q_f32( #endif if (params->type == GGML_TASK_INIT) { - char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } } } } @@ -11097,43 +10816,52 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - // parallelize by src0 rows using ggml_vec_dot_q + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1)/nth; - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; + const int64_t ir10 = dr*ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; - void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - const int i13 = i03; - const int i12 = i02; + const int64_t ir0 = (ir1/ne11)%(ne02*ne03); + const int64_t i03 = (ir0/(ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. + // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + // GG: this is likely the correct way to broadcast, though need some more thought + // therefore leaving the comments to remind us for now + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size)); + const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); - assert(ne00 % 32 == 0); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); + for (int64_t ir = ir10; ir < ir11; ++ir) { + vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); } } @@ -11150,40 +10878,6 @@ static void ggml_compute_forward_mul_mat_q_f32( //} } -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - { - ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} // ggml_compute_forward_out_prod @@ -11196,35 +10890,7 @@ static void ggml_compute_forward_out_prod_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - //const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -11459,15 +11125,8 @@ static void ggml_compute_forward_set_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - const size_t nb12 = src1->nb[2]; - const size_t nb13 = src1->nb[3]; + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during set const size_t nb0 = ggml_element_size(src0); @@ -11608,7 +11267,7 @@ static void ggml_compute_forward_get_rows_q( const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); @@ -11858,29 +11517,14 @@ static void ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted matrices - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - const int ne03 = src0->ne[3]; - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; + GGML_TENSOR_UNARY_OP_LOCALS; + GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne1); GGML_ASSERT(ne01 == 1); GGML_ASSERT(ne02 == ne2); GGML_ASSERT(ne03 == ne3); - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float)); @@ -12214,7 +11858,7 @@ static void ggml_compute_forward_alibi_f32( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -12225,8 +11869,9 @@ static void ggml_compute_forward_alibi_f32( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12250,7 +11895,7 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (i-ne0+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } @@ -12279,7 +11924,7 @@ static void ggml_compute_forward_alibi_f16( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -12290,8 +11935,9 @@ static void ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(ggml_fp16_t)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12316,7 +11962,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } } @@ -12444,33 +12090,25 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12495,7 +12133,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12507,7 +12145,7 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12584,33 +12222,25 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12635,7 +12265,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12647,7 +12277,7 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12708,7 +12338,7 @@ static void ggml_compute_forward_rope_f16( const float x0 = GGML_FP16_TO_FP32(src[0]); const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -12763,21 +12393,7 @@ static void ggml_compute_forward_rope_back_f32( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12876,21 +12492,7 @@ static void ggml_compute_forward_rope_back_f16( assert(n_past >= 0); - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12988,7 +12590,7 @@ static void ggml_compute_forward_rope_back( } } -// ggml_compute_forward_conv_1d_s1_ph +// ggml_compute_forward_conv_1d static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, @@ -13002,36 +12604,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13122,36 +12695,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13251,8 +12795,6 @@ static void ggml_compute_forward_conv_1d_s1_ph( } } -// ggml_compute_forward_conv_1d_s2_ph - static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -13265,36 +12807,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13385,36 +12898,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13514,12 +12998,35 @@ static void ggml_compute_forward_conv_1d_s2_ph( } } -// ggml_compute_forward_conv_2d_sk_p0 +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t p0 = ((const int32_t*)(opt0->data))[1]; + const int32_t d0 = ((const int32_t*)(opt0->data))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported + if (s0 == 1) { + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); + } else if (s0 == 2) { + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + } else { + GGML_ASSERT(false); // only stride 1 and 2 supported + }; +} + +// ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( +static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13528,36 +13035,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; - //const int ne03 = src0->ne[3]; - - const int ne10 = src1->ne[0]; - //const int ne11 = src1->ne[1]; - const int ne12 = src1->ne[2]; - //const int ne13 = src1->ne[3]; - - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - //const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - //const int nb01 = src0->nb[1]; - //const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - //const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - //const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13568,11 +13046,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t s1 = ((const int32_t*)(opt0->data))[1]; + const int32_t p0 = ((const int32_t*)(opt0->data))[2]; + const int32_t p1 = ((const int32_t*)(opt0->data))[3]; + const int32_t d0 = ((const int32_t*)(opt0->data))[4]; + const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare source data (src1) @@ -13587,8 +13071,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( for (int i0 = 0; i0 < ne0; i0++) { for (int ik1 = 0; ik1 < nk1; ik1++) { for (int ik0 = 0; ik0 < nk0; ik0++) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -13596,60 +13085,225 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( } } - return; - } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); + } + } + } + } +} + +static void ggml_compute_forward_conv_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst + ) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); + } break; + case GGML_TYPE_F32: + { + //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* opt0, + struct ggml_tensor* dst) { + GGML_ASSERT(opt0->ne[0] == 4); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - // total patches in dst - const int np = ne2; + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); - // patches per thread - const int dp = (np + nth - 1)/nth; + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); + float * dplane = (float *)dst->data; - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + const int ka = k0 * k1; - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2); + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0); + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } } } + + cdata += src->nb[2]; + dplane += pa; } } -static void ggml_compute_forward_conv_2d_sk_p0( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); - GGML_ASSERT(false); - } break; - default: - { - GGML_ASSERT(false); - } break; - } +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(opt0->ne[0] == 7); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); } + // ggml_compute_forward_flash_attn static void ggml_compute_forward_flash_attn_f32( @@ -13662,45 +13316,14 @@ static void ggml_compute_forward_flash_attn_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13871,45 +13494,14 @@ static void ggml_compute_forward_flash_attn_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14143,65 +13735,18 @@ static void ggml_compute_forward_flash_ff_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t nea0 = a->ne[0]; - const int64_t nea1 = a->ne[1]; - const int64_t nea2 = a->ne[2]; - const int64_t nea3 = a->ne[3]; - - const int64_t neb00 = b0->ne[0]; - const int64_t neb01 = b0->ne[1]; - //const int64_t neb02 = b0->ne[2]; - //const int64_t neb03 = b0->ne[3]; - - const int64_t neb10 = b1->ne[0]; - const int64_t neb11 = b1->ne[1]; - //const int64_t neb12 = b1->ne[2]; - //const int64_t neb13 = b1->ne[3]; - - const int64_t nec00 = c0->ne[0]; - const int64_t nec01 = c0->ne[1]; - //const int64_t nec02 = c0->ne[2]; - //const int64_t nec03 = c0->ne[3]; - - const int64_t nec10 = c1->ne[0]; - const int64_t nec11 = c1->ne[1]; - //const int64_t nec12 = c1->ne[2]; - //const int64_t nec13 = c1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nba0 = a->nb[0]; - const int nba1 = a->nb[1]; - const int nba2 = a->nb[2]; - const int nba3 = a->nb[3]; - - const int nbb00 = b0->nb[0]; - const int nbb01 = b0->nb[1]; - const int nbb02 = b0->nb[2]; - const int nbb03 = b0->nb[3]; - - const int nbb10 = b1->nb[0]; - //const int nbb11 = b1->nb[1]; - //const int nbb12 = b1->nb[2]; - //const int nbb13 = b1->nb[3]; - - const int nbc00 = c0->nb[0]; - const int nbc01 = c0->nb[1]; - const int nbc02 = c0->nb[2]; - const int nbc03 = c0->nb[3]; - - const int nbc10 = c1->nb[0]; - //const int nbc11 = c1->nb[1]; - //const int nbc12 = c1->nb[2]; - //const int nbc13 = c1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, nea, a, ne); + GGML_TENSOR_LOCALS(size_t, nba, a, nb); + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14349,55 +13894,16 @@ static void ggml_compute_forward_flash_attn_back_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ned0 = d->ne[0]; - const int64_t ned1 = d->ne[1]; - //const int64_t ned2 = d->ne[2]; - //const int64_t ned3 = d->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nbd0 = d->nb[0]; - const int nbd1 = d->nb[1]; - const int nbd2 = d->nb[2]; - const int nbd3 = d->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ned, d, ne); + GGML_TENSOR_LOCALS(size_t, nbd, d, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -14755,15 +14261,8 @@ static void ggml_compute_forward_win_part_f32( return; } - const int64_t ne00 = src0->ne[0]; UNUSED(ne00); - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; UNUSED(ne03); - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; UNUSED(ne3); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; @@ -14826,14 +14325,8 @@ static void ggml_compute_forward_win_unpart_f32( return; } - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); const int32_t w = ((const int32_t *)(opt0->data))[0]; @@ -15374,279 +14867,295 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_compute_forward_dup(params, tensor->src[0], tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_compute_forward_sqr(params, tensor->src[0], tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_compute_forward_log(params, tensor->src[0], tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_compute_forward_sum(params, tensor->src[0], tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_compute_forward_mean(params, tensor->src[0], tensor); + } break; + case GGML_OP_ARGMAX: + { + ggml_compute_forward_argmax(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_compute_forward_repeat(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; case GGML_OP_ABS: { - ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_compute_forward_abs(params, tensor->src[0], tensor); } break; case GGML_OP_SGN: { - ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_compute_forward_sgn(params, tensor->src[0], tensor); } break; case GGML_OP_NEG: { - ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_compute_forward_neg(params, tensor->src[0], tensor); } break; case GGML_OP_STEP: { - ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_compute_forward_step(params, tensor->src[0], tensor); + } break; + case GGML_OP_TANH: + { + ggml_compute_forward_tanh(params, tensor->src[0], tensor); + } break; + case GGML_OP_ELU: + { + ggml_compute_forward_elu(params, tensor->src[0], tensor); } break; case GGML_OP_RELU: { - ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_compute_forward_relu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU: { - ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_compute_forward_gelu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor); } break; case GGML_OP_SILU: { - ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_compute_forward_silu(params, tensor->src[0], tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_compute_forward_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_compute_forward_cpy(params, tensor->src[0], tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_compute_forward_cont(params, tensor->src[0], tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src0, tensor); + ggml_compute_forward_reshape(params, tensor->src[0], tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src0); + ggml_compute_forward_view(params, tensor->src[0]); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src0); + ggml_compute_forward_permute(params, tensor->src[0]); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src0); + ggml_compute_forward_transpose(params, tensor->src[0]); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_compute_forward_diag(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D: + { + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_POOL_1D: { - ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_2D_SK_P0: + case GGML_OP_POOL_2D: { - ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_FLASH_ATTN: { - const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + const int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { - int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + int32_t t = ggml_get_i32_1d(tensor->src[4], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_NONE: @@ -15663,8 +15172,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; switch (tensor->op) { case GGML_OP_DUP: @@ -15700,12 +15209,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15830,6 +15339,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; case GGML_OP_MEAN: + case GGML_OP_ARGMAX: { GGML_ASSERT(false); // TODO: implement } break; @@ -15883,6 +15393,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; + case GGML_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RELU: { if (src0->grad) { @@ -15902,14 +15420,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_ALIBI: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CLAMP: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_SILU: { // necessary for llama @@ -16012,12 +15522,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -16094,8 +15604,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); - memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); + memcpy(&offset, tensor->src[2]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -16122,7 +15632,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->opt[0]->data; + int32_t * axes = (int32_t *) tensor->src[2]->data; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -16226,7 +15736,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(ggml_nelements(src1) == 6); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16247,7 +15757,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 3); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16266,30 +15776,42 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; - case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_ALIBI: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CLAMP: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_S2_PH: + case GGML_OP_POOL_1D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_2D_SK_P0: + case GGML_OP_POOL_2D: { GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_FLASH_ATTN: { struct ggml_tensor * flash_grad = NULL; - if (src0->grad || src1->grad || tensor->opt[0]->grad) { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, src0, src1, - tensor->opt[0], + tensor->src[2], tensor->grad, masked); } @@ -16386,7 +15908,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } - struct ggml_tensor * opt0 = tensor->opt[0]; + struct ggml_tensor * opt0 = tensor->src[2]; if (opt0->grad) { struct ggml_tensor * grad_v = NULL; @@ -16502,17 +16024,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->src0) { - ggml_visit_parents(cgraph, node->src0); - } - - if (node->src1) { - ggml_visit_parents(cgraph, node->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); } } @@ -16567,9 +16081,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -16740,16 +16251,20 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; }; struct ggml_compute_state { @@ -16769,14 +16284,22 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; - const int n_threads = state->shared->n_threads; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16784,16 +16307,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = n_tasks_arr[node_n]; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); + } } // distribute new work or execute it direct if 1T @@ -16801,27 +16326,37 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); + params.nth = n_tasks; + /* INIT */ - params.type = GGML_TASK_INIT; - params.nth = node->n_tasks; - ggml_compute_forward(¶ms, node); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); + } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); + } } else { break; } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } } atomic_store(&state->shared->n_active, n_threads); @@ -16830,7 +16365,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // wait for other threads to finish const int last = node_n; do { - sched_yield(); + //sched_yield(); node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } @@ -16840,382 +16375,398 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } - return 0; + return GGML_EXIT_SUCCESS; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + size_t work_size = 0; - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); - // initialize tasks + work buffer - { - size_t work_size = 0; + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_tensor * node = cgraph->nodes[i]; - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - node->n_tasks = n_threads; + size_t cur = 0; - size_t cur = 0; + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; + } - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else -#endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else #endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } - } else { - GGML_ASSERT(false); + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src[0]->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); } + } else +#endif + if (node->src[1]->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D_S1_PH: - case GGML_OP_CONV_1D_S2_PH: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src[0]->ne[0]; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D_SK_P0: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; + const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; + + UNUSED(ne03); + UNUSED(ne2); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - GGML_ASSERT(node->src1->ne[3] == 1); + work_size = MAX(work_size, cur); + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + size_t cur = 0; - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - const int64_t nk = ne00*ne01; + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } - size_t cur = 0; + cplan.n_tasks[i] = n_tasks; + } - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } + return cplan; +} - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } } } + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -17236,12 +16787,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join thread pool + // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); @@ -17265,6 +16816,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); } + + return compute_status; } void ggml_graph_reset(struct ggml_cgraph * cgraph) { @@ -17277,6 +16830,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); + + cplan.work_data = buf->data; + + ggml_graph_compute(cgraph, &cplan); +} + struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -17315,14 +16879,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17359,8 +16922,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { ggml_graph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); } // header @@ -17371,17 +16934,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { for (int i = 0; i < cgraph->n_nodes; ++i) { ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); } } @@ -17435,13 +16990,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // dump the data @@ -17475,27 +17023,17 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nb, sizeof(uint64_t), 1, fout); } - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); // output the op arguments { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { if (args[j]) { int32_t idx = -1; @@ -17666,8 +17204,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->op = (enum ggml_op) op; - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; tensor->data = (void *) ptr; @@ -17713,16 +17249,14 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** nb[j] = nb_cur; } - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used - const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; // parse args - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { const int32_t arg_idx = ptr_arg_idx[j]; if (arg_idx == -1) { @@ -17779,11 +17313,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->nb[j] = nb[j]; } - tensor->src0 = args[0]; - tensor->src1 = args[1]; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - tensor->opt[j] = args[2 + j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; } result.nodes[i] = tensor; @@ -17982,19 +17513,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - if (node->src0) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); } } } @@ -18002,19 +17525,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_leafs; i++) { struct ggml_tensor * node = gb->leafs[i]; - if (node->src0) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); } } } @@ -18076,9 +17591,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -18125,7 +17637,8 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -18205,7 +17718,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -18327,7 +17841,8 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -18395,9 +17910,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -18449,7 +17961,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/llmfarm_core.swift/Sources/llmfarm_core/ggml.h b/llmfarm_core.swift/Sources/llmfarm_core/ggml.h index 4599132..24856a2 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/ggml.h +++ b/llmfarm_core.swift/Sources/llmfarm_core/ggml.h @@ -65,7 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -132,10 +132,10 @@ // { // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); // -// // a[1, 2] = 1.0f; +// // a[2, 1] = 1.0f; // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; // -// // a[2, 0] = 2.0f; +// // a[0, 2] = 2.0f; // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; // // ... @@ -197,10 +197,17 @@ #define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +#define GGML_UNUSED(x) (void)(x) + + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ @@ -209,6 +216,30 @@ } \ } while (0) +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. +// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + #ifdef __cplusplus extern "C" { #endif @@ -224,8 +255,8 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); - GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); - GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); struct ggml_object; struct ggml_context; @@ -295,12 +326,15 @@ extern "C" { GGML_OP_SUM, GGML_OP_SUM_ROWS, GGML_OP_MEAN, + GGML_OP_ARGMAX, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, GGML_OP_STEP, + GGML_OP_TANH, + GGML_OP_ELU, GGML_OP_RELU, GGML_OP_GELU, GGML_OP_GELU_QUICK, @@ -332,9 +366,10 @@ extern "C" { GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CLAMP, - GGML_OP_CONV_1D_S1_PH, - GGML_OP_CONV_1D_S2_PH, - GGML_OP_CONV_2D_SK_P0, + GGML_OP_CONV_1D, + GGML_OP_CONV_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, @@ -386,12 +421,7 @@ extern "C" { bool is_param; struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; - - // thread scheduling - int n_tasks; + struct ggml_tensor * src[GGML_MAX_SRC]; // performance int perf_runs; @@ -404,19 +434,31 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -444,6 +486,9 @@ extern "C" { // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. enum ggml_task_type { GGML_TASK_INIT = 0, GGML_TASK_COMPUTE, @@ -687,6 +732,11 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b GGML_API struct ggml_tensor * ggml_repeat( @@ -731,6 +781,22 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_relu( struct ggml_context * ctx, struct ggml_tensor * a); @@ -1055,6 +1121,17 @@ extern "C" { int mode, int n_ctx); + // custom RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( @@ -1081,58 +1158,58 @@ extern "C" { float min, float max); - // TODO: implement general-purpose convolutions - // GGML_API struct ggml_tensor * ggml_conv_1d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0 - // int p0, - // int d0); - // - // GGML_API struct ggml_tensor * ggml_conv_2d( - // struct ggml_context * ctx, - // struct ggml_tensor * a, - // struct ggml_tensor * b, - // int s0, - // int s1, - // int p0, - // int p1, - // int d0, - // int d1); - - // padding = half - // TODO: we don't support extra parameters for now - // that's why we are hard-coding the stride, padding, and dilation - // not great .. - // example: - // a: 3 80 768 1 - // b: 3000 80 1 1 - // res: 3000 768 1 1 - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph( + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation - // used in whisper - GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph( + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); - // kernel size is a->ne[0] x a->ne[1] - // stride is equal to kernel size - // padding is zero - // example: - // a: 16 16 3 768 - // b: 1024 1024 3 1 - // res: 64 64 768 1 - // used in sam - GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + int s, + int d); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor* ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, @@ -1263,15 +1340,22 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); @@ -1488,25 +1572,24 @@ extern "C" { // #ifdef __cplusplus - // restrict not standard in C++ +// restrict not standard in C++ #define GGML_RESTRICT #else #define GGML_RESTRICT restrict #endif - typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); - typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; - quantize_row_q_t quantize_row_q_dot; - vec_dot_q_t vec_dot_q; - enum ggml_type vec_dot_type; - } quantize_fns_t; - - quantize_fns_t ggml_internal_get_quantize_fn(size_t i); + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; + + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); #ifdef __cplusplus } diff --git a/llmfarm_core.swift/Sources/llmfarm_core/gpt2/gpt2.cpp b/llmfarm_core.swift/Sources/llmfarm_core/gpt2/gpt2.cpp index 4f2188a..8b6de14 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/gpt2/gpt2.cpp +++ b/llmfarm_core.swift/Sources/llmfarm_core/gpt2/gpt2.cpp @@ -504,14 +504,13 @@ bool gpt2_eval( } struct ggml_init_params params = { - .mem_size = buf_size, - .mem_buffer = buf, - .no_alloc = false, + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -756,7 +755,7 @@ bool gpt2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -782,7 +781,6 @@ bool gpt2_eval( - struct gpt2_context * gpt2_init_from_file(const char * path_model, struct gpt_context_params params) { ggml_time_init(); diff --git a/llmfarm_core.swift/Sources/llmfarm_core/gptneox/gptneox.cpp b/llmfarm_core.swift/Sources/llmfarm_core/gptneox/gptneox.cpp index 55288c9..2dc9187 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/gptneox/gptneox.cpp +++ b/llmfarm_core.swift/Sources/llmfarm_core/gptneox/gptneox.cpp @@ -72,9 +72,27 @@ struct gpt_neox_hparams:gpt_base_hparams { int32_t ftype = 1; }; -struct gpt_neox_model: gpt_base_model { - gpt_neox_hparams hparams ; +struct gpt_neox_model { + gpt_neox_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + + struct ggml_tensor * lmh_g; // language model head + //struct ggml_tensor * lmh_b; // language model bias + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; }; @@ -136,16 +154,6 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ // } const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - { - switch (hparams.n_layer) { - case 16: model.type = e_model::MODEL_3B; break; - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_3B; break; - case 40: model.type = e_model::MODEL_7B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - } - } printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -154,7 +162,6 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_rot = %d\n", __func__, hparams.n_rot); printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: model_size = %s\n", __func__, gpt_model_type_name(model.type)); printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", __func__, qntvr); @@ -266,7 +273,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); // map by name @@ -275,7 +282,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_ model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; - model.tensors["embed_out.weight"] = model.lm_head; + model.tensors["embed_out.weight"] = model.lmh_g; //model.tensors["lm_head.bias"] = model.lmh_b; for (int i = 0; i < n_layer; ++i) { @@ -506,14 +513,13 @@ bool gpt_neox_eval( } struct ggml_init_params params = { - .mem_size = buf_size, - .mem_buffer = buf, - .no_alloc = false, + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -671,7 +677,7 @@ bool gpt_neox_eval( // lm_head { - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); //inpL = ggml_add(ctx0, // ggml_repeat(ctx0, model.lmh_b, inpL), @@ -683,7 +689,7 @@ bool gpt_neox_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -706,7 +712,7 @@ bool gpt_neox_eval( return true; } -// + @@ -735,16 +741,16 @@ struct gpt_neox_context * gpt_neox_init_from_file(const char * path_model, struc // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { - fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); - gpt_neox_free(ctx); - return nullptr; - } - - { - const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); - fprintf(stderr, "%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); - } +// if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { +// fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); +// gpt_neox_free(ctx); +// return nullptr; +// } +// +// { +// const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); +// fprintf(stderr, "%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); +// } const auto & hparams = ctx->model.hparams; diff --git a/llmfarm_core.swift/Sources/llmfarm_core/k_quants.h b/llmfarm_core.swift/Sources/llmfarm_core/k_quants.h index 6abe3d7..adc6a39 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/k_quants.h +++ b/llmfarm_core.swift/Sources/llmfarm_core/k_quants.h @@ -15,6 +15,14 @@ #define K_SCALE_SIZE 12 #endif +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + // // Super-block quantization structures // diff --git a/llmfarm_core.swift/Sources/llmfarm_core/llama/llama.cpp b/llmfarm_core.swift/Sources/llmfarm_core/llama/llama.cpp index 5341826..dc45918 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/llama/llama.cpp +++ b/llmfarm_core.swift/Sources/llmfarm_core/llama/llama.cpp @@ -17,7 +17,10 @@ #endif #ifdef GGML_USE_METAL -#include "ggml-metal.h" +#include "../ggml-metal.h" +#endif +#ifdef GGML_USE_MPI +#include "ggml-mpi.h" #endif #ifdef GGML_USE_K_QUANTS #ifndef QK_K @@ -66,6 +69,7 @@ enum e_model { MODEL_65B, }; +static const size_t kB = 1024; static const size_t MB = 1024*1024; // computed for n_ctx == 2048 @@ -78,14 +82,34 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static const std::map & MEM_REQ_SCRATCH0() +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + +static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + /* empirical scaling, still a guess */ + { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB }, }; return k_sizes; } @@ -117,14 +141,42 @@ static const std::map & MEM_REQ_KV_SELF() // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map & MEM_REQ_EVAL() +static const std::map & MEM_REQ_EVAL(int n_ctx) +{ + static std::map k_sizes = { + { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB }, + }; + return k_sizes; +} + +// amount of VRAM needed per batch size to hold temporary results +// the values for 3b and 65b are not derived from testing but instead chosen conservatively +static const std::map & VRAM_REQ_SCRATCH_BASE() { static std::map k_sizes = { - { MODEL_3B, 512ull * MB }, - { MODEL_7B, 768ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + { MODEL_3B, 512ull * kB }, + { MODEL_7B, 512ull * kB }, + { MODEL_13B, 640ull * kB }, + { MODEL_30B, 768ull * kB }, + { MODEL_65B, 1536ull * kB }, + }; + return k_sizes; +} + +// amount of VRAM needed per batch size and context to hold temporary results +// the values for 3b and 65b are not derived from testing but instead chosen conservatively +static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() +{ + static std::map k_sizes = { + { MODEL_3B, 128ull }, + { MODEL_7B, 128ull }, + { MODEL_13B, 160ull }, + { MODEL_30B, 208ull }, + { MODEL_65B, 416ull }, }; return k_sizes; } @@ -138,6 +190,10 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { @@ -165,8 +221,8 @@ struct llama_layer { }; struct llama_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; + struct ggml_tensor * k = NULL; + struct ggml_tensor * v = NULL; struct ggml_context * ctx = NULL; @@ -252,8 +308,7 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} - + llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} #ifdef GGML_USE_METAL ~llama_context() { if (ctx_metal) { @@ -274,7 +329,6 @@ struct llama_context { int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) const llama_model & model; - const llama_vocab & vocab; bool model_owner = false; @@ -293,6 +347,9 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -302,6 +359,10 @@ struct llama_context { ggml_metal_context * ctx_metal = NULL; #endif +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = NULL; +#endif + int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -371,34 +432,6 @@ static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml return size / ggml_blck_size(type); } -//void llama_shift_kv_cache(struct llama_context * ctx, int n) { -// auto & model = ctx->model; -// auto & kv_self = model.kv_self; -// auto & hparams = model.hparams; -// auto n_layer = hparams.n_layer; -// auto n_embd = hparams.n_embd; -// auto n_ctx = hparams.n_ctx; -// for(int il = 0; il < n_layer; il++) { -// // K: Embeddings are in regular order so moving them is easy as copying the memory -// { -// int elem_byte_size = ggml_element_size(kv_self.k); -// uint8_t * dst_ptr = ((uint8_t *)kv_self.k->data) + (elem_byte_size * n_embd * (il * n_ctx)); -// uint8_t * src_ptr = ((uint8_t *)kv_self.k->data) + (elem_byte_size * n_embd * (il * n_ctx + n)); -// memcpy(dst_ptr, src_ptr, elem_byte_size * n_embd * (n_ctx - n)); -// } -// -// // V: Embeddings are transposed so each embedding element must be copied separately -// { -// int elem_byte_size = ggml_element_size(kv_self.v); -// for(int i = 0; i < n_embd; i++) { -// uint8_t * dst_ptr = ((uint8_t *)kv_self.v->data) + (elem_byte_size * (il * n_ctx * i)); -// uint8_t * src_ptr = ((uint8_t *)kv_self.v->data) + (elem_byte_size * (il * n_ctx * i + n)); -// memcpy(dst_ptr, src_ptr, elem_byte_size * (n_ctx - n)); -// } -// } -// } -//} - struct llama_load_tensor { std::string name; enum ggml_type type = GGML_TYPE_F32; @@ -481,9 +514,7 @@ struct llama_file_loader { std::string word = file.read_string(len); float score = 0.0f; - if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { - file.read_raw(&score, sizeof(score)); - } + file.read_raw(&score, sizeof(score)); vocab.token_to_id[word] = i; @@ -621,7 +652,7 @@ struct llama_model_loader { *ctx_size_p = *mmapped_size_p = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; } } @@ -760,7 +791,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -812,12 +842,14 @@ static bool kv_cache_init( struct llama_context_params llama_context_default_params() { struct llama_context_params result = { - /*.seed =*/ -1, + /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -851,7 +883,7 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } -void llama_init_backend(bool numa) { +void llama_backend_init(bool numa) { ggml_time_init(); // needed to initialize f16 tables @@ -864,6 +896,16 @@ void llama_init_backend(bool numa) { if (numa) { ggml_numa_init(); } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} + +void llama_backend_free() { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif } int64_t llama_time_us() { @@ -931,6 +973,8 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -965,22 +1009,27 @@ static void llama_model_load_internal( } hparams.n_ctx = n_ctx; + + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; } const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1129,9 +1178,9 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size - vram_weights + // weights in VRAM not in memory - MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at (model.type); + MEM_REQ_EVAL(hparams.n_ctx).at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = @@ -1147,14 +1196,18 @@ static void llama_model_load_internal( fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); ggml_cuda_set_scratch_size(0); // disable scratch } else { - vram_scratch = n_batch * MB; + const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); + const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type); + vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n", - __func__, vram_scratch / MB); + fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", + __func__, vram_scratch_base / kB, vram_scratch_per_context, + (vram_scratch + MB - 1) / MB); // round up } } #endif // GGML_USE_CUBLAS + #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -1163,6 +1216,10 @@ static void llama_model_load_internal( fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__); } size_t vram_kv_cache = 0; + +#ifdef GGML_USE_CUBLAS + const int max_backend_supported_layers = hparams.n_layer + 3; + const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; if (n_gpu_layers > (int) hparams.n_layer + 1) { if (low_vram) { fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); @@ -1179,14 +1236,18 @@ static void llama_model_load_internal( vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; } } - const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; +#elif defined(GGML_USE_CLBLAST) + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#endif // GGML_USE_CUBLAS + fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n", - __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3); + __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); fprintf(stderr, "%s: total VRAM used: %zu MB\n", __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up #else (void) n_gpu_layers; -#endif +#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } // populate `tensors_by_name` @@ -1223,6 +1284,8 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1231,7 +1294,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1253,18 +1316,16 @@ static bool llama_eval_internal( llama_context & lctx, const llama_token * tokens, const float * embd, - const int n_tokens, - const int n_past, - const int n_threads, + int n_tokens, + int n_past, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); - // enforce that the first token is BOS - if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif const int64_t t_start_us = ggml_time_us(); @@ -1285,6 +1346,9 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; const int n_gpu_layers = model.n_gpu_layers; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1296,20 +1360,26 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; if (tokens) { - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } @@ -1327,18 +1397,20 @@ static bool llama_eval_internal( offload_func_t offload_func_v = llama_nop; #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; - } + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers; + } #endif // GGML_USE_CUBLAS for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); + offload_func_t offload_func = llama_nop; #ifdef GGML_USE_CUBLAS @@ -1374,11 +1446,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -1545,7 +1617,6 @@ static bool llama_eval_internal( // input for next layer inpL = cur; - } lctx.use_buf(ctx0, 0); @@ -1553,7 +1624,6 @@ static bool llama_eval_internal( // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; - // norm { cur = ggml_rms_norm(ctx0, inpL); @@ -1568,7 +1638,6 @@ static bool llama_eval_internal( embeddings = cur; } - // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); @@ -1581,8 +1650,13 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); +#if GGML_USE_MPI + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer); +#endif + #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); } else { @@ -1602,12 +1676,21 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } #else - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); +#endif + +#if GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer); #endif + // update kv token count + lctx.kv_self.n = n_past + N; + + struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1]; + if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); } @@ -1623,23 +1706,17 @@ static bool llama_eval_internal( // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); //} - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); - - // update kv token count - lctx.kv_self.n = n_past + N; - // extract logits { auto & logits_out = lctx.logits; if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -1895,10 +1972,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can return; } - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(ctx, candidates); + const int64_t t_start_sample_us = ggml_time_us(); + // Compute the cumulative probabilities float cum_sum = 0.0f; size_t last_idx = candidates->size; @@ -1927,9 +2004,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * return; } - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_time_us(); // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); @@ -1981,11 +2057,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c return; } - const int64_t t_start_sample_us = ggml_time_us(); - // Compute the softmax of logits and calculate entropy llama_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_time_us(); + float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { entropy += -candidates->data[i].p * logf(candidates->data[i].p); @@ -2109,6 +2185,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l } } +static void llama_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} + +void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor) { + int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + + assert(ctx); + auto n_vocab = llama_n_vocab(ctx); + assert(n_vocab == (int)candidates->size); + assert(!candidates->sorted); + + std::vector logits_base; + logits_base.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + logits_base.push_back(candidates->data[i].logit); + } + llama_log_softmax(logits_base.data(), candidates->size); + + float* logits_guidance = llama_get_logits(guidance_ctx); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_guidance = logits_guidance[i]; + float logit_base = logits_base[i]; + logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance; + } + + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_base = logits_base[i]; + float logit_guidance = logits_guidance[i]; + + candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); @@ -2154,13 +2286,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ if (ctx) { ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; } return X; } llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { - assert(ctx); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); @@ -2175,13 +2305,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok candidates->size = 1; } + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + // Normalize the probabilities of the remaining words llama_sample_softmax(ctx, candidates); // Sample the next word X from the remaining words - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } llama_token X = llama_sample_token(ctx, candidates); t_start_sample_us = ggml_time_us(); @@ -2249,10 +2380,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam } float * f32_output = (float *) output.addr; - quantize_fns_t qtype; + ggml_type_traits_t qtype; if (ggml_is_quantized(tensor.type)) { - qtype = ggml_internal_get_quantize_fn(tensor.type); - if (qtype.dequantize_row_q == NULL) { + qtype = ggml_internal_get_type_traits(tensor.type); + if (qtype.to_float == NULL) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); } } else if (tensor.type != GGML_TYPE_F16) { @@ -2263,7 +2394,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam if (tensor.type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); } else if (ggml_is_quantized(tensor.type)) { - qtype.dequantize_row_q(tensor.data, f32_output, nelements); + qtype.to_float(tensor.data, f32_output, nelements); } else { LLAMA_ASSERT(false); // unreachable } @@ -2288,7 +2419,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam if (typ == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); } else { - qtype.dequantize_row_q(inbuf, outbuf, nels); + qtype.to_float(inbuf, outbuf, nels); } }; workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); @@ -2397,15 +2528,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS + bool convert_incompatible_tensor = false; if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); - fprintf(stderr, "This is required to be able to use k-quants for now!\n"); - fprintf(stderr, "========================================================================================\n\n"); - throw std::runtime_error("Unsupported tensor size encountered\n"); + fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + convert_incompatible_tensor = true; } } if (tensor.name == "output.weight") { @@ -2433,6 +2563,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + if (convert_incompatible_tensor) { + if (tensor.name == "output.weight") { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + fprintf(stderr, "F16 will be used for this tensor instead.\n"); + } else if (tensor.name == "tok_embeddings.weight") { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. + fprintf(stderr, "Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } #endif float * f32_data; @@ -2552,8 +2693,9 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, + params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; @@ -2567,16 +2709,16 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; } - llama_context * ctx = new llama_context(*model, model->vocab); + llama_context * ctx = new llama_context(*model); - if (params.seed < 0) { + if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } @@ -2628,16 +2770,16 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2672,6 +2814,18 @@ struct llama_context * llama_new_context_with_model( } #endif +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); + + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); + while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_backend_free(); + exit(1); + } +#endif + return ctx; } @@ -2794,6 +2948,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + std::vector work_buffer; + while (true) { int32_t n_dims; int32_t length; @@ -2958,8 +3115,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3009,8 +3166,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { #define LLAMA_MAX_RNG_STATE (64*1024) -void llama_set_rng_seed(struct llama_context * ctx, int seed) { - if (seed < 0) { +void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { seed = time(NULL); } ctx->rng.seed(seed); @@ -3112,7 +3269,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3132,7 +3288,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3218,7 +3374,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3238,7 +3393,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3254,7 +3409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { return nread; } -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(path_session, "rb"); // sanity checks @@ -3308,6 +3463,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi return true; } +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + fprintf(stderr, "error loading session file: %s\n", err.what()); + return false; + } +} + bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { llama_file file(path_session, "wb"); @@ -3390,13 +3554,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { return 0; } -int llama_tokenize( - struct llama_context * ctx, +int llama_tokenize_with_model( + const struct llama_model * model, const char * text, llama_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = llama_tokenize(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -3410,8 +3574,29 @@ int llama_tokenize( return res.size(); } +int llama_tokenize( + struct llama_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); +} + +int llama_n_vocab_from_model(const struct llama_model * model) { + return model->vocab.id_to_token.size(); +} + +int llama_n_ctx_from_model(const struct llama_model * model) { + return model->hparams.n_ctx; +} + +int llama_n_embd_from_model(const struct llama_model * model) { + return model->hparams.n_embd; +} + int llama_n_vocab(const struct llama_context * ctx) { - return ctx->vocab.id_to_token.size(); + return ctx->model.vocab.id_to_token.size(); } int llama_n_ctx(const struct llama_context * ctx) { @@ -3422,19 +3607,27 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } -int llama_get_vocab( - const struct llama_context * ctx, +int llama_get_vocab_from_model( + const struct llama_model * model, const char * * strings, float * scores, int capacity) { - int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); + int n = std::min(capacity, (int) model->vocab.id_to_token.size()); for (int i = 0; ivocab.id_to_token[i].tok.c_str(); - scores[i] = ctx->vocab.id_to_token[i].score; + strings[i] = model->vocab.id_to_token[i].tok.c_str(); + scores[i] = model->vocab.id_to_token[i].score; } return n; } +int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity) { + return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } @@ -3443,12 +3636,16 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { + if (token >= llama_n_vocab_from_model(model)) { return nullptr; } - return ctx->vocab.id_to_token[token].tok.c_str(); + return model->vocab.id_to_token[token].tok.c_str(); +} + +const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { + return llama_token_to_str_with_model(&ctx->model, token); } llama_token llama_token_bos() { @@ -3463,23 +3660,35 @@ llama_token llama_token_nl() { return 13; } +struct llama_timings llama_get_timings(struct llama_context * ctx) { + struct llama_timings result = { + /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, + /*.t_end_ms =*/ 1.00 * ggml_time_ms(), + /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, + /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, + /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, + /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, -void llama_print_timings(struct llama_context * ctx) { - const int64_t t_end_us = ggml_time_us(); + /*.n_sample =*/ std::max(1, ctx->n_sample), + /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_eval =*/ std::max(1, ctx->n_eval), + }; - const int32_t n_sample = std::max(1, ctx->n_sample); - const int32_t n_eval = std::max(1, ctx->n_eval); - const int32_t n_p_eval = std::max(1, ctx->n_p_eval); + return result; +} + +void llama_print_timings(struct llama_context * ctx) { + const llama_timings timings = llama_get_timings(ctx); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample); + __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval); + __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); + __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { diff --git a/llmfarm_core.swift/Sources/llmfarm_core/replit/replit.cpp b/llmfarm_core.swift/Sources/llmfarm_core/replit/replit.cpp index ce18896..4ba5b0c 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/replit/replit.cpp +++ b/llmfarm_core.swift/Sources/llmfarm_core/replit/replit.cpp @@ -494,7 +494,6 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -633,7 +632,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); @@ -665,7 +664,6 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa - struct replit_context * replit_init_from_file(const char * path_model, struct gpt_context_params params) { ggml_time_init(); diff --git a/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/gpt_spm.h b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/gpt_spm.h index 302b93c..cf61efd 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/gpt_spm.h +++ b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/gpt_spm.h @@ -31,7 +31,7 @@ typedef void (*gpt_progress_callback)(float progress, void *ctx); struct gpt_context_params { int n_ctx; // text context int n_parts; // -1 for default - int seed; // RNG seed, 0 for random + uint32_t seed; // RNG seed, 0 for random bool f16_kv; // use fp16 for KV cache bool logits_all; // the gptneox_eval() call computes all logits, not just the last one diff --git a/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/llama.h b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/llama.h index 704eca6..fb4d923 100644 --- a/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/llama.h +++ b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/llama.h @@ -46,6 +46,8 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 +#define LLAMA_DEFAULT_SEED 0xFFFFFFFF + #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. #define LLAMA_SUPPORTS_GPU_OFFLOAD @@ -63,7 +65,7 @@ extern "C" { struct llama_model; struct llama_context; - + typedef int llama_token; typedef struct llama_token_data { @@ -81,12 +83,17 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - int seed; // RNG seed, -1 for random - int n_ctx; // text context - int n_batch; // prompt processing batch size - int n_gpu_layers; // number of layers to store in VRAM - int main_gpu; // the GPU that is used for scratch and small tensors + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback @@ -132,6 +139,20 @@ extern "C" { bool quantize_output_tensor; // quantize output.weight } llama_model_quantize_params; + // performance timing information + struct llama_timings { + double t_start_ms; + double t_end_ms; + double t_load_ms; + double t_sample_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_sample; + int32_t n_p_eval; + int32_t n_eval; + }; + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); @@ -142,9 +163,9 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_init_backend(bool numa); - -// LLAMA_API void llama_shift_kv_cache(struct llama_context * ctx, int n); + LLAMA_API void llama_backend_init(bool numa); + // Call once at the end of the program - currently only used for MPI + LLAMA_API void llama_backend_free(); LLAMA_API int64_t llama_time_us(); @@ -198,7 +219,7 @@ extern "C" { LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); + LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens @@ -254,10 +275,21 @@ extern "C" { int n_max_tokens, bool add_bos); + LLAMA_API int llama_tokenize_with_model( + const struct llama_model * model, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); + LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); + LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + // Get the vocabulary as output parameters. // Returns number of results. LLAMA_API int llama_get_vocab( @@ -266,6 +298,12 @@ extern "C" { float * scores, int capacity); + LLAMA_API int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token @@ -278,7 +316,13 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str( + const struct llama_context * ctx, + llama_token token); + + LLAMA_API const char * llama_token_to_str_with_model( + const struct llama_model * model, + llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence @@ -293,6 +337,18 @@ extern "C" { /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. + /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. + LLAMA_API void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor); + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); @@ -331,6 +387,7 @@ extern "C" { LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); // Performance information + LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_reset_timings(struct llama_context * ctx); diff --git a/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/starcoder.h b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/starcoder.h new file mode 100644 index 0000000..80452c4 --- /dev/null +++ b/llmfarm_core.swift/Sources/llmfarm_core/spm-headers/starcoder.h @@ -0,0 +1,44 @@ +#ifndef starcoder_H +#define starcoder_H + +#include +#include +#include + + + +#ifdef __cplusplus +extern "C" { +#endif + + + +typedef int starcoder_token; + +struct gpt_base_context; +struct starcoder_context; + + +void starcoder_free(struct starcoder_context * ctx); + + +struct gpt_context_params; + +struct starcoder_context * starcoder_init_from_file(const char * path_model, struct gpt_context_params params); + + +int starcoder_init_logits(struct starcoder_context * ctx,int n_threads); + +int starcoder_eval( + struct starcoder_context * ctx, + const starcoder_token * tokens, + int n_tokens, + int n_past, + int n_threads); +#ifdef __cplusplus +} +#endif + + + +#endif // starcoder diff --git a/llmfarm_core.swift/Sources/llmfarm_core/starcoder/main.cpp b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/main.cpp new file mode 100644 index 0000000..d84e366 --- /dev/null +++ b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/main.cpp @@ -0,0 +1,927 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams (GPT-2 117M) +// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json +struct starcoder_hparams { + int32_t n_vocab = 49280; + int32_t n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; +}; + +struct starcoder_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct starcoder_model { + starcoder_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// load the model's weights from a file +bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + // Add StarChat special tokens. + for (const std::string & token : { + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|end|>", + "", + "", + "", + "", + "<|end_of_turn|>" + }) { + if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { + vocab.add_special_token(token); + } + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", + __func__, name.data(), (int) ggml_nelements(tensor), nelements); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool starcoder_eval( + const starcoder_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + starcoder_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + if (params.repeat_last_n == -1) { + params.repeat_last_n = model.hparams.n_ctx; + } + printf("\n"); + printf("%s: temp = %.3f\n", __func__, params.temp); + printf("%s: top_k = %d\n", __func__, params.top_k); + printf("%s: top_p = %.3f\n", __func__, params.top_p); + printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); + printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + std::vector last_n_tokens(model.hparams.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + } + printf("\n\n"); + + // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens. + gpt_vocab::id starchat_end_token = -1; + { + const auto it = vocab.token_to_id.find("<|end|>"); + if (it != vocab.token_to_id.end()) { + starchat_end_token = it->second; + } else { + const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>"); + if (eot_token_id != vocab.token_to_id.end()) { + starchat_end_token = eot_token_id->second; + } + } + } + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng); + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (int k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[k]); + + if (embd.size() >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // check if model is santacoder + if (model.hparams.n_layer <= 30 && embd.back() == 49152) { + break; + } + // check if model is starcoder + else if (embd.back() == 0) { //TODO: this is only for starcoder + break; + } + // Handle StarChat "<|end|>" token. + else if (embd.back() == starchat_end_token && i >= embd_inp.size()) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder-mmap.cpp b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder-mmap.cpp new file mode 100644 index 0000000..094c441 --- /dev/null +++ b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder-mmap.cpp @@ -0,0 +1,1124 @@ +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// mmap +#include +#include +#include +#include + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_CLBLAST +#include "ggml-opencl.h" +#endif + +// default hparams (GPT-2 117M) +// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json +struct starcoder_hparams { + int32_t n_vocab = 49280; + int32_t n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; +}; + +struct starcoder_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct llama_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + llama_buffer() = default; + + void resize(size_t len) { +#ifdef GGML_USE_METAL + free(addr); + int result = posix_memalign((void **) &addr, getpagesize(), len); + if (result == 0) { + memset(addr, 0, len); + } + else { + addr = NULL; + } +#else + delete[] addr; + addr = new uint8_t[len]; +#endif + size = len; + } + + ~llama_buffer() { +#ifdef GGML_USE_METAL + free(addr); +#else + delete[] addr; +#endif + addr = NULL; + } + + // disable copy and move + llama_buffer(const llama_buffer&) = delete; + llama_buffer(llama_buffer&&) = delete; + llama_buffer& operator=(const llama_buffer&) = delete; + llama_buffer& operator=(llama_buffer&&) = delete; +}; + + +struct kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx = NULL; + + //std::vector buf; + llama_buffer buf; + + int n; +}; + +struct starcoder_model { + starcoder_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + //struct ggml_tensor * memory_k; + //struct ggml_tensor * memory_v; + struct kv_cache cache; + + // model memory mapped file + void * mm_addr = NULL; + uint64_t mm_length = 0; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +// From PR #613 (https://github.com/ggerganov/llama.cpp/pull/613) +static void *mmap_file(const char *fname, uint64_t *mm_length) { +#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) + HANDLE hFile = CreateFileA(fname, + GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, + NULL); + if (hFile == INVALID_HANDLE_VALUE) return 0; + LARGE_INTEGER fileSize; + fileSize.QuadPart = -1; + GetFileSizeEx(hFile, &fileSize); + int64_t length = fileSize.QuadPart; + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + CloseHandle(hFile); + if (!hMapping) return 0; + void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(hMapping); + if (!addr) return 0; +#else + int fd = open(fname, O_RDONLY); + if (fd == -1) return 0; + int64_t length = lseek(fd, 0, SEEK_END); + void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) return 0; +#endif + *mm_length = length; + return addr; +} + +static void munmap_file(void * addr, size_t length) { +#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES) + UnmapViewOfFile(addr); +#else + munmap(addr, length); +#endif +} + +// load the model's weights from a file +bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab, int32_t n_gpu_layers) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + std::vector f_buf(1024*1024); + fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); + + fin.seekg(0, fin.end); + const size_t file_size = fin.tellg(); + fin.seekg(0); + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + //if (magic != 0x67676a74) { + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + // Add StarChat special tokens. + for (const std::string & token : { + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|end|>", + }) { + if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { + vocab.add_special_token(token); + } + } + } + + char *mm_addr = NULL; + model.mm_addr = mmap_file(fname.c_str(), &model.mm_length); + if (model.mm_addr == NULL) { + fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str()); + return false; + } + mm_addr = (char *)model.mm_addr; + fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0)); + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + + /* + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + */ + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + //printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.cache.buf.resize(2u*n_elements*ggml_type_size(GGML_TYPE_F16) + 2u*1024*1024); + + struct ggml_init_params c_params; + c_params.mem_size = model.cache.buf.size; + c_params.mem_buffer = model.cache.buf.addr; + c_params.no_alloc = false; + + model.cache.ctx = ggml_init(c_params); + + if (!model.cache.ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + model.cache.k = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, n_elements); + model.cache.v = ggml_new_tensor_1d(model.cache.ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.cache.k) + ggml_nbytes(model.cache.v); + + printf("%s: kv_cache memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", + __func__, name.data(), (int) ggml_nelements(tensor), nelements); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + // mmap + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(tensor); + //offset = (offset + 31) & -32; + tensor->data = mm_addr + offset; + fin.seekg(offset + tensor_data_size); + total_size += tensor_data_size; + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + // Dont know if this is required, test models have an lm_head + model.lm_head->data = tensor->data; + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + +#ifdef GGML_USE_CUBLAS + { + const auto & hparams = model.hparams; + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); + + size_t vram_total = 0; + + for (int i = 0; i < n_gpu; ++i) { + const auto & layer = model.layers[i]; + + layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; + ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_attn_w->data, layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); + + layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; + ggml_cuda_transform_tensor((uint8_t *)layer.c_attn_proj_w->data, layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); + + layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; + ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_fc_w->data, layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); + + layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; + ggml_cuda_transform_tensor((uint8_t *)layer.c_mlp_proj_w->data, layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + } + + ggml_cuda_set_scratch_size(0); // disable scratch + + //if (n_gpu_layers > (int) hparams.n_layer) { + // fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); + // ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + //} + + fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + } +#elif defined(GGML_USE_CLBLAST) + //From koboldcpp + { + const auto & hparams = model.hparams; + size_t vram_total = 0; + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + for (int i = 0; i < n_gpu; ++i) { + const auto & layer = model.layers[i]; + layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; + layer.c_attn_proj_w->backend = GGML_BACKEND_GPU; + layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU; + layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU; + ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); + ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); + ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); + ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); + } + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + } + #endif + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool starcoder_eval( + const starcoder_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + auto & cache = model.cache; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + + // Scratch is too small for large n_batch (256) + //static size_t buf_size = 256u*1024*1024; + static size_t buf_size = 256u*1024*1024*2; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024*2; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024*2; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, cache.k, N*n_embd, (ggml_element_size(cache.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, cache.v, N*n_embd, (ggml_element_size(cache.v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, cache.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, cache.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(cache.v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, cache.v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); + + ggml_free(ctx0); + + return true; +} + + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + starcoder_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_model_load(params.model, model, vocab, params.n_gpu_layers)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < embd_inp.size(); i++) { + printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + } + printf("\n\n"); + + // Handle StarChat "<|end|>" token. + gpt_vocab::id starchat_end_token = -1; + { + const auto it = vocab.token_to_id.find("<|end|>"); + if (it != vocab.token_to_id.end()) { + starchat_end_token = it->second; + } + } + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + printf("Calling starcoder_eval\n"); + starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to predict\n"); + return 1; + } + + // Should input processing count towards t_predict? + if (i > embd_inp.size()) { + t_predict_us += ggml_time_us() - t_start_us; + } + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (int k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (embd.size() >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // check if model is santacoder + if (model.hparams.n_layer <= 30 && embd.back() == 49152) { + break; + } + // check if model is starcoder + else if (embd.back() == 0) { //TODO: this is only for starcoder + break; + } + // Handle StarChat "<|end|>" token. + else if (embd.back() == starchat_end_token) { + //break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + //Shouldnt the input prompt be subracted? + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(n_past - embd_inp.size())); + //printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + if (model.mm_addr) { + munmap_file(model.mm_addr, model.mm_length); + } + + return 0; +} diff --git a/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder.cpp b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder.cpp new file mode 100644 index 0000000..7ac708d --- /dev/null +++ b/llmfarm_core.swift/Sources/llmfarm_core/starcoder/starcoder.cpp @@ -0,0 +1,1035 @@ +#include "../spm-headers/starcoder.h" +#include "../gpt_helpers.h" +#include "../spm-headers/gpt_spm.h" + +#include "../ggml.h" + +#include "../common.h" +#include "../common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams (GPT-2 117M) +// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json +struct starcoder_hparams:gpt_base_hparams { + int32_t n_vocab = 49280; + int32_t n_ctx = 2048; + int32_t n_embd = 2048; + int32_t n_head = 16; + int32_t n_layer = 24; + int32_t ftype = 1; +}; + +struct starcoder_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct starcoder_model { + starcoder_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + + +struct starcoder_context:gpt_base_context { + starcoder_model model; +}; + +void starcoder_free(struct starcoder_context * ctx) { + delete ctx; +} + +// load the model's weights from a file +bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + // Add StarChat special tokens. + for (const std::string & token : { + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|end|>", + "", + "", + "", + "", + "<|end_of_turn|>" + }) { + if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) { + vocab.add_special_token(token); + } + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO: + ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + const int head_dim = n_embd / hparams.n_head; + const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head + const int kv_dim = kv_heads * head_dim; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd + 2*kv_dim); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n", + __func__, name.data(), (int) ggml_nelements(tensor), nelements); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool starcoder_eval( + const starcoder_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + // use 2 scratch buffers + // TODO: very hacky solution - reimplement in a more elegant way + static size_t scr0_size = 256u*1024*1024; + static void * scr0 = malloc(scr0_size); + + static size_t scr1_size = 256u*1024*1024; + static void * scr1 = malloc(scr1_size); + + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024)); + + ggml_free(ctx0); + + return true; +} + + +struct starcoder_context * starcoder_init_from_file(const char * path_model, struct gpt_context_params params) { + ggml_time_init(); + + starcoder_context * ctx = new starcoder_context; + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + ctx->rng = std::mt19937(params.seed); + ctx->logits_all = params.logits_all; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!starcoder_model_load(path_model, ctx->model, ctx->vocab)) { + fprintf(stderr, "%s: failed to load model\n", __func__); + delete ctx; + return nullptr; + } + + + // reserve memory for context buffers + if (!params.vocab_only) { +// if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { +// fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); +// delete ctx; +// return nullptr; +// } +// +// { +// const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); +// fprintf(stderr, "%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); +// } + + const auto & hparams = ctx->model.hparams; + + // resized during inference + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_vocab); + } + + if (params.embedding){ + ctx->embedding.resize(hparams.n_embd); + } + +// ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); +// +// ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); +// ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); + } + + return ctx; +} + + +int starcoder_init_logits(struct starcoder_context * ctx,int n_threads){ + size_t mem_per_token = 0; + if (!starcoder_eval(ctx->model, n_threads, 0, { 0, 1, 2, 3 }, ctx->logits, mem_per_token)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + return 0; +} + +int starcoder_eval( + struct starcoder_context * ctx, + const starcoder_token * tokens, + int n_tokens, + int n_past, + int n_threads) { + +// std::vector logits; + std::vector embd; + for (int i=0;imodel, n_threads, 0, { 0, 1, 2, 3 }, ctx->logits, mem_per_token); + // if (!gptneox_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) { + if (!starcoder_eval(ctx->model, n_threads, n_past, embd, ctx->logits, mem_per_token)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + // get a more accurate load time, upon first eval + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + return 0; +} + +// +//int test_run(int argc, char ** argv) { +// ggml_time_init(); +// +// const int64_t t_main_start_us = ggml_time_us(); +// +// gpt_params params; +// params.model = "models/gpt-2-117M/ggml-model.bin"; +// +// if (gpt_params_parse(argc, argv, params) == false) { +// return 1; +// } +// +// if (params.seed < 0) { +// params.seed = time(NULL); +// } +// +// printf("%s: seed = %d\n", __func__, params.seed); +// +// std::mt19937 rng(params.seed); +// if (params.prompt.empty()) { +// params.prompt = gpt_random_prompt(rng); +// } +// +// int64_t t_load_us = 0; +// +// gpt_vocab vocab; +// starcoder_model model; +// +// // load the model +// { +// const int64_t t_start_us = ggml_time_us(); +// +// if (!starcoder_model_load(params.model, model, vocab)) { +// fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); +// return 1; +// } +// +// t_load_us = ggml_time_us() - t_start_us; +// +// test_gpt_tokenizer(vocab, params.token_test); +// } +// +// if (params.repeat_last_n == -1) { +// params.repeat_last_n = model.hparams.n_ctx; +// } +// printf("\n"); +// printf("%s: temp = %.3f\n", __func__, params.temp); +// printf("%s: top_k = %d\n", __func__, params.top_k); +// printf("%s: top_p = %.3f\n", __func__, params.top_p); +// printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); +// printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); +// +// int n_past = 0; +// +// int64_t t_sample_us = 0; +// int64_t t_predict_us = 0; +// +// std::vector logits; +// +// std::vector last_n_tokens(model.hparams.n_ctx); +// std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); +// +// // tokenize the prompt +// std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); +// +// params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); +// +// printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); +// printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); +// for (int i = 0; i < embd_inp.size(); i++) { +// printf("%s: token[%d] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); +// } +// printf("\n\n"); +// +// // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens. +// gpt_vocab::id starchat_end_token = -1; +// { +// const auto it = vocab.token_to_id.find("<|end|>"); +// if (it != vocab.token_to_id.end()) { +// starchat_end_token = it->second; +// } else { +// const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>"); +// if (eot_token_id != vocab.token_to_id.end()) { +// starchat_end_token = eot_token_id->second; +// } +// } +// } +// +// // submit the input prompt token-by-token +// // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning +// std::vector embd; +// +// // determine the required inference memory per token: +// size_t mem_per_token = 0; +// starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); +// +// for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { +// // predict +// if (embd.size() > 0) { +// const int64_t t_start_us = ggml_time_us(); +// +// if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { +// printf("Failed to predict\n"); +// return 1; +// } +// +// t_predict_us += ggml_time_us() - t_start_us; +// } +// +// n_past += embd.size(); +// embd.clear(); +// +// if (i >= embd_inp.size()) { +// // sample next token +// const int top_k = params.top_k; +// const float top_p = params.top_p; +// const float temp = params.temp; +// +// const int n_vocab = model.hparams.n_vocab; +// +// gpt_vocab::id id = 0; +// +// { +// const int64_t t_start_sample_us = ggml_time_us(); +// +// id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng); +// t_sample_us += ggml_time_us() - t_start_sample_us; +// } +// +// // add it to the context +// embd.push_back(id); +// +// last_n_tokens.erase(last_n_tokens.begin()); +// last_n_tokens.push_back(id); +// } else { +// // if here, it means we are still processing the input prompt +// for (int k = i; k < embd_inp.size(); k++) { +// embd.push_back(embd_inp[k]); +// +// last_n_tokens.erase(last_n_tokens.begin()); +// last_n_tokens.push_back(embd_inp[k]); +// +// if (embd.size() >= params.n_batch) { +// break; +// } +// } +// i += embd.size() - 1; +// } +// +// // display text +// for (auto id : embd) { +// printf("%s", vocab.id_to_token[id].c_str()); +// } +// fflush(stdout); +// +// // check if model is santacoder +// if (model.hparams.n_layer <= 30 && embd.back() == 49152) { +// break; +// } +// // check if model is starcoder +// else if (embd.back() == 0) { //TODO: this is only for starcoder +// break; +// } +// // Handle StarChat "<|end|>" token. +// else if (embd.back() == starchat_end_token && i >= embd_inp.size()) { +// break; +// } +// } +// +// // report timing +// { +// const int64_t t_main_end_us = ggml_time_us(); +// +// printf("\n\n"); +// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); +// printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); +// printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); +// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); +// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); +// } +// +// ggml_free(model.ctx); +// +// return 0; +//} diff --git a/metal/ggml-metal.metal b/metal/ggml-metal.metal index e62fe68..9f9a4fb 100644 --- a/metal/ggml-metal.metal +++ b/metal/ggml-metal.metal @@ -365,6 +365,10 @@ kernel void kernel_rms_norm( } } +// putting them in the kernel cause a significant performance penalty +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 kernel void kernel_mul_mat_q4_0_f32( device const void * src0, device const float * src1, @@ -372,64 +376,83 @@ kernel void kernel_mul_mat_q4_0_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK4_0; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_0 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; + + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } + sumy *= (-8.f); - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_0 + first; - - float2 acc = {0.0f, 0.0f}; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - for (int j = 0; j < 4; ++j) { + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc; + qb_curr = qb_next; + } + } - acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4); - acc[1] += yl[j] + yl[j+16]; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } + sumy *= (-8.f); - sumf += d * (acc[0] - 8.f*acc[1]); - } + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - sum[ith] = sumf; + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc; + } + qb_curr = qb_next; - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } @@ -440,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { - const int nb = ne00/QK4_1; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb; + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_1 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; + + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } - const uint nth = tptg.x*tptg.y; - const uint ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - const float m = (float)x[i].m; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_1 + first; - - float2 acc = {0.0f, 0.0f}; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - for (int j = 0; j < 4; ++j) { + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc + m * sumy; + qb_curr = qb_next; + } + } - acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m); - acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m); + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } - sumf += acc[0] + acc[1]; - } + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - sum[ith] = sumf; + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc + m * sumy; + } + qb_curr = qb_next; - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } @@ -615,17 +656,19 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(10000.0, -2.0f/n_dims); + const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {