diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
index 392aec674d..20f2dd4418 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -9,6 +9,7 @@
 using TorchSharp;
 using Microsoft.ML.GenAI.Core;
 using Microsoft.ML.GenAI.Core.Extension;
+using Microsoft.ML.Tokenizers;
 
 namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
 
@@ -26,12 +27,15 @@ public static async Task RunAsync()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: false);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
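+        // Build the pipeline from its parts: tokenizer, model, and device.
+        // layersOnTargetDevice: -1 appears to keep every layer on the target device (in the
+        // new FromPretrained overload below, "cpu" only receives the leftover layers), and
+        // quantizeToInt8: true loads the weights as int8 to reduce memory use.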
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
+        var question = @"write a C# program to calculate the factorial of a number";
 
         // agent
         var agent = new Phi3Agent(pipeline, "assistant")
             .RegisterPrintMessage();
-        var question = @"write a C# program to calculate the factorial of a number";
 
         // chat with the assistant
         await agent.SendAsync(question);
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
index a6f445b643..8ba882618b 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
@@ -1,4 +1,7 @@
-using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi;
+using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.ML.Tokenizers;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.ChatCompletion;
 using TorchSharp;
@@ -20,8 +23,10 @@ public static async Task RunChatCompletionSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device);
-
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
 
         var kernel = Kernel.CreateBuilder()
             .AddGenAIChatCompletion(pipeline)
@@ -49,8 +54,10 @@ public static async Task RunTextGenerationSample()
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device);
-
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
 
         var kernel = Kernel.CreateBuilder()
             .AddGenAITextGeneration(pipeline)
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs
deleted file mode 100644
index 33819a8df4..0000000000
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-using Microsoft.ML.GenAI.Core;
-using Microsoft.ML.GenAI.Phi;
-using Tensorboard;
-using static TorchSharp.torch;
-using TorchSharp;
-using Microsoft.ML.GenAI.Core.Extension;
-using System.Text.Json;
-using Microsoft.ML.Tokenizers;
-
-namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
-
-internal static class Utils
-{
-    public static ICausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM> LoadPhi3Mini4KFromFolder(
-        string weightFolder,
-        string configName = "config.json",
-        string device = "cuda",
-        int modelSizeOnCudaInGB = 55,
-        int modelSizeOnMemoryInGB = 64,
-        int modelSizeOnDiskInGB = 200,
-        bool quantizeToInt8 = false,
-        bool quantizeToInt4 = false)
-    {
-        Console.WriteLine("Loading Phi3 from huggingface model weight folder");
-        torch.set_default_device("meta");
-        var configPath = System.IO.Path.Combine(weightFolder, configName);
-        var config = JsonSerializer.Deserialize<Phi3Config>(System.IO.File.ReadAllText(configPath)) ?? throw new ArgumentNullException(nameof(configPath));
-        var timer = System.Diagnostics.Stopwatch.StartNew();
-        var model = new Phi3ForCasualLM(config);
-        var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model");
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenzierPath);
-
-        if (quantizeToInt8)
-        {
-            model.ToInt8QuantizeModule();
-        }
-        else if (quantizeToInt4)
-        {
-            model.ToInt4QuantizeModule();
-        }
-
-        var deviceSizeMap = new Dictionary<string, long>
-        {
-            ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024,
-            ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024,
-            ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024,
-        };
-
-        var deviceMap = model.InferDeviceMapForEachLayer(
-            devices: ["cuda", "cpu", "disk"],
-            deviceSizeMapInByte: deviceSizeMap);
-
-        var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true });
-        Console.WriteLine($"Device map:");
-        Console.WriteLine(deviceMapJson);
-
-        // load weight
-        torch.set_default_device("cpu");
-
-        Console.WriteLine("Start loading");
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        model = new Phi3ForCasualLM(config);
-        timer.Stop();
-        Console.WriteLine($"Phi3 model created in {timer.ElapsedMilliseconds / 1000} s");
-
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        model.LoadSafeTensors(weightFolder);
-        timer.Stop();
-        Console.WriteLine($"Phi3 weight loaded in {timer.ElapsedMilliseconds / 1000} s");
-
-        if (quantizeToInt8 || quantizeToInt4)
-        {
-            timer = System.Diagnostics.Stopwatch.StartNew();
-            Console.WriteLine("Start quantizing if needed");
-            if (quantizeToInt8)
-            {
-                model.ToInt8QuantizeModule();
-            }
-            else if (quantizeToInt4)
-            {
-                model.ToInt4QuantizeModule();
-            }
-            Console.WriteLine("Quantizing done");
-            timer.Stop();
-            Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s");
-        }
-
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        Console.WriteLine($"Start loading to device: {device}");
-        model = model.ToDynamicLoadingModel(deviceMap, "cuda");
-        timer.Stop();
-        Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s");
-        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
-        torch.set_default_device(device);
-
-        return pipeline;
-    }
-}
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
index 1560bad306..5e4355e595 100644
--- a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
@@ -1,4 +1,4 @@
 // See https://aka.ms/new-console-template for more information
 using Microsoft.ML.GenAI.Samples.Phi3Mini;
 
-await SemanticKernelSample.RunChatCompletionSample();
+await AutoGenSample.RunAsync();
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index 7ecb64f761..33e0bab19c 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -32,6 +32,11 @@ string Generate(
         float topP = CausalLMPipeline.Defaults.TopP,
         string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
 
+    /// <summary>
+    /// Generate the embedding (the last hidden state of the last token) for the prompt. The embedding is normalized by its L2 norm.
+    /// </summary>
+    float[] GenerateEmbeddingFromLastTokenPool(string prompt);
+
     IEnumerable<string> GenerateStreaming(
         string prompt,
         int maxLen = CausalLMPipeline.Defaults.MaxLen,
@@ -281,4 +286,23 @@ protected torch.Tensor SampleTopP(torch.Tensor logits, float topP)
         nextToken = torch.gather(probsIndex, dim: -1, index: nextToken);
         return nextToken;
     }
+
+    public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
+    {
+        using var scope = NewDisposeScope();
+        using var noGrad = torch.no_grad();
+        var inputIds = this.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor, device: this.Device);
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0);
+        var output = this.Model.forward(input);
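+        // Last-token pooling: under a causal attention mask only the final token has
+        // attended to the entire prompt, so its hidden state serves as the sequence embedding.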
+        var lastTokenHiddenState = output.LastHiddenState[0, ^1];
+
+        // shape of lastTokenHiddenState: [hidden_size]
+        // L2 norm
+        var norm = lastTokenHiddenState.norm();
+        var normalized = lastTokenHiddenState / norm;
+
+        return normalized.to_type(ScalarType.Float32).data<float>().ToArray();
+    }
 }
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs
index c67741377e..a5840b242a 100644
--- a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs
@@ -9,6 +9,7 @@
 using System.Text.Json;
 using System.Threading.Tasks;
 using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Core.Extension;
 using Microsoft.ML.GenAI.Phi.Module;
 using TorchSharp;
 using TorchSharp.Modules;
@@ -66,6 +67,55 @@ public static Phi3ForCasualLM FromPretrained(
         return phi;
     }
 
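+    /// <summary>
+    /// Load a pretrained Phi-3 model from <paramref name="modelFolder"/>, optionally quantized
+    /// to int8 or int4, placing <paramref name="layersOnTargetDevice"/> layers on
+    /// <paramref name="targetDevice"/> and offloading the rest to cpu. The model is first
+    /// created on the meta device (no weights allocated) to infer a device map, then rebuilt
+    /// on cpu, loaded from the safetensors checkpoint, and dispatched according to the map.
+    /// When <paramref name="layersOnTargetDevice"/> is -1 and no quantization is requested,
+    /// the whole model is loaded directly onto <paramref name="targetDevice"/>.
+    /// </summary>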
+    public static Phi3ForCasualLM FromPretrained(
+        string modelFolder,
+        string configName = "config.json",
+        string checkPointName = "model.safetensors.index.json",
+        bool quantizeToInt8 = false,
+        bool quantizeToInt4 = false,
+        int layersOnTargetDevice = -1,
+        ScalarType torchDtype = ScalarType.BFloat16,
+        string targetDevice = "cuda")
+    {
+        if (layersOnTargetDevice == -1 && quantizeToInt4 == false && quantizeToInt8 == false)
+        {
+            return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice);
+        }
+
+        var originalDefaultDevice = torch.get_default_device();
+        torch.set_default_device("meta");
+        var config = Path.Join(modelFolder, configName);
+        var modelConfig = JsonSerializer.Deserialize<Phi3Config>(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config));
+        modelConfig.DType = torchDtype;
+        var model = new Phi3ForCasualLM(modelConfig);
+
+        if (quantizeToInt8)
+        {
+            model.ToInt8QuantizeModule();
+        }
+        else if (quantizeToInt4)
+        {
+            model.ToInt4QuantizeModule();
+        }
+
+        var deviceMap = model.InferDeviceMapForEachLayer(
+            [
+                KeyValuePair.Create(targetDevice, layersOnTargetDevice),
+                KeyValuePair.Create("cpu", -1)
+            ]);
+
+        torch.set_default_device("cpu");
+        model = new Phi3ForCasualLM(modelConfig);
+
+        model.LoadSafeTensors(modelFolder, checkPointName);
+
+        model = model.ToDynamicLoadingModel(deviceMap, targetDevice);
+
+        torch.set_default_device(originalDefaultDevice);
+
+        return model;
+    }
+
     public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json")
     {
         this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, useTqdm: false);