diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
index 1fa8823763..d3985d1777 100644
--- a/Microsoft.ML.sln
+++ b/Microsoft.ML.sln
@@ -176,7 +176,15 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TorchSharp.Tes
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TensorFlow.Tests", "test\Microsoft.ML.TensorFlow.Tests\Microsoft.ML.TensorFlow.Tests.csproj", "{763FF013-8309-4680-A769-B54E7BB99612}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Core", "src\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj", "{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Core", "src\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj", "{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Phi", "src\Microsoft.ML.GenAI.Phi\Microsoft.ML.GenAI.Phi.csproj", "{694BF884-B2E4-4E1C-9342-0564BAAC4575}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Phi.Tests", "test\Microsoft.ML.GenAI.Phi.Tests\Microsoft.ML.GenAI.Phi.Tests.csproj", "{867FFC34-DFA7-400F-B9BB-85158326CE08}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.GenAI.Samples", "docs\samples\Microsoft.ML.GenAI.Samples\Microsoft.ML.GenAI.Samples.csproj", "{1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Core.Tests", "test\Microsoft.ML.GenAI.Core.Tests\Microsoft.ML.GenAI.Core.Tests.csproj", "{14AB0804-D4CE-4634-B544-5A8587620783}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -838,6 +846,38 @@ Global
{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|Any CPU.Build.0 = Release|Any CPU
{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|x64.ActiveCfg = Release|Any CPU
{DB2CA055-8ABD-4E3E-8089-5B64C3415E85}.Release|x64.Build.0 = Release|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Debug|x64.Build.0 = Debug|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Release|Any CPU.Build.0 = Release|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Release|x64.ActiveCfg = Release|Any CPU
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575}.Release|x64.Build.0 = Release|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Debug|x64.Build.0 = Debug|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Release|Any CPU.Build.0 = Release|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Release|x64.ActiveCfg = Release|Any CPU
+ {867FFC34-DFA7-400F-B9BB-85158326CE08}.Release|x64.Build.0 = Release|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Debug|x64.Build.0 = Debug|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Release|Any CPU.Build.0 = Release|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Release|x64.ActiveCfg = Release|Any CPU
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47}.Release|x64.Build.0 = Release|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Debug|x64.Build.0 = Debug|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Release|Any CPU.Build.0 = Release|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Release|x64.ActiveCfg = Release|Any CPU
+ {14AB0804-D4CE-4634-B544-5A8587620783}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -925,6 +965,10 @@ Global
{AB8D68F1-6C3E-41FD-B0EC-A093E009341D} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{763FF013-8309-4680-A769-B54E7BB99612} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{DB2CA055-8ABD-4E3E-8089-5B64C3415E85} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {694BF884-B2E4-4E1C-9342-0564BAAC4575} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {867FFC34-DFA7-400F-B9BB-85158326CE08} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+ {1D4AD9A3-19AF-432B-889D-A63FE6D7BD47} = {DA452A53-2E94-4433-B08C-041EDEC729E6}
+ {14AB0804-D4CE-4634-B544-5A8587620783} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT
index 650a7bd53d..0a86247f4e 100644
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@@ -171,3 +171,29 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+License notice for TorchSharp.PyBridge
+------------------------------------------
+https://github.com/shaltielshmid/TorchSharp.PyBridge/blob/main/LICENSE
+
+MIT License
+
+Copyright (c) 2023 shaltielshmid
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/build/ci/job-template.yml b/build/ci/job-template.yml
index 8a78ea548b..3270a42615 100644
--- a/build/ci/job-template.yml
+++ b/build/ci/job-template.yml
@@ -121,7 +121,7 @@ jobs:
- ${{ if eq(parameters.nightlyBuild, 'false') }}:
- ${{ if eq(parameters.innerLoop, 'false') }}:
- ${{ if and(eq(parameters.runSpecific, 'false'), eq(parameters.useVSTestTask, 'false')) }}:
- - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.2.0.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.102.5\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
+ - script: set PATH=%PATH%;%USERPROFILE%\.nuget\packages\libtorch-cpu-win-x64\2.2.1.1\runtimes\win-x64\native;%USERPROFILE%\.nuget\packages\torchsharp\0.102.7\runtimes\win-x64\native & ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
displayName: Run All Tests.
- ${{ if and(eq(parameters.runSpecific, 'true'), eq(parameters.useVSTestTask, 'false')) }}:
- script: ${{ parameters.buildScript }} /p:Build=false -configuration $(_configuration) /p:TargetArchitecture=${{ parameters.architecture }} /p:TestArchitectures=${{ parameters.architecture }} -test -integrationTest /p:TestRunnerAdditionalArguments='-trait$(spaceValue)Category=RunSpecificTest' /p:Coverage=${{ parameters.codeCoverage }} $(testTargetFramework)
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj b/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj
new file mode 100644
index 0000000000..0331a32fc1
--- /dev/null
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Microsoft.ML.GenAI.Samples.csproj
@@ -0,0 +1,20 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!-- project and package references elided -->
+  </ItemGroup>
+
+</Project>
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
new file mode 100644
index 0000000000..379fd2b97b
--- /dev/null
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/AutoGenSample.cs
@@ -0,0 +1,39 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using AutoGen.Core;
+using Microsoft.ML.GenAI.Phi;
+using static TorchSharp.torch;
+using TorchSharp;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Core.Extension;
+
+namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
+
+public class AutoGenSample
+{
+ public static async Task RunAsync()
+ {
+ var device = "cuda";
+ if (device == "cuda")
+ {
+ torch.InitializeDeviceType(DeviceType.CUDA);
+ }
+
+ var defaultType = ScalarType.Float16;
+ torch.manual_seed(1);
+ torch.set_default_dtype(defaultType);
+ var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
+ var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device);
+
+ // agent
+ var agent = new Phi3Agent(pipeline, "assistant")
+ .RegisterPrintMessage();
+ var question = @"write a C# program to calculate the factorial of a number";
+
+ // chat with the assistant
+ await agent.SendAsync(question);
+ }
+}
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
new file mode 100644
index 0000000000..a6f445b643
--- /dev/null
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/SemanticKernelSample.cs
@@ -0,0 +1,62 @@
+using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
+
+public class SemanticKernelSample
+{
+ public static async Task RunChatCompletionSample()
+ {
+ var device = "cuda";
+ if (device == "cuda")
+ {
+ torch.InitializeDeviceType(DeviceType.CUDA);
+ }
+
+ var defaultType = ScalarType.Float16;
+ torch.manual_seed(1);
+ torch.set_default_dtype(defaultType);
+ var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
+ var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device);
+
+
+ var kernel = Kernel.CreateBuilder()
+ .AddGenAIChatCompletion(pipeline)
+ .Build();
+ var chatService = kernel.GetRequiredService<IChatCompletionService>();
+ var chatHistory = new ChatHistory();
+ chatHistory.AddSystemMessage("you are a helpful assistant");
+ chatHistory.AddUserMessage("write a C# program to calculate the factorial of a number");
+
+ await foreach (var response in chatService.GetStreamingChatMessageContentsAsync(chatHistory))
+ {
+ Console.Write(response);
+ }
+ }
+
+ public static async Task RunTextGenerationSample()
+ {
+ var device = "cuda";
+ if (device == "cuda")
+ {
+ torch.InitializeDeviceType(DeviceType.CUDA);
+ }
+
+ var defaultType = ScalarType.Float16;
+ torch.manual_seed(1);
+ torch.set_default_dtype(defaultType);
+ var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
+ var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device);
+
+
+ var kernel = Kernel.CreateBuilder()
+ .AddGenAITextGeneration(pipeline)
+ .Build();
+
+ var response = await kernel.InvokePromptAsync("Tell a joke");
+ Console.WriteLine(response);
+ }
+}
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs
new file mode 100644
index 0000000000..5e53ef0ac4
--- /dev/null
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Phi3Mini/Utils.cs
@@ -0,0 +1,103 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi;
+using Tensorboard;
+using static TorchSharp.torch;
+using TorchSharp;
+using Microsoft.ML.GenAI.Core.Extension;
+using System.Text.Json;
+using Microsoft.ML.Tokenizers;
+
+namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
+
+internal static class Utils
+{
+ public static ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> LoadPhi3Mini4KFromFolder(
+ string weightFolder,
+ string configName = "config.json",
+ string device = "cuda",
+ int modelSizeOnCudaInGB = 16,
+ int modelSizeOnMemoryInGB = 64,
+ int modelSizeOnDiskInGB = 200,
+ bool quantizeToInt8 = false,
+ bool quantizeToInt4 = false)
+ {
+ Console.WriteLine("Loading Phi3 from huggingface model weight folder");
+ torch.set_default_device("meta");
+ var configPath = System.IO.Path.Combine(weightFolder, configName);
+ var config = JsonSerializer.Deserialize<Phi3Config>(System.IO.File.ReadAllText(configPath)) ?? throw new ArgumentNullException(nameof(configPath));
+ var timer = System.Diagnostics.Stopwatch.StartNew();
+ var model = new Phi3ForCasualLM(config);
+ var tokenizerPath = System.IO.Path.Combine(weightFolder, "tokenizer.model");
+ var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+
+ if (quantizeToInt8)
+ {
+ model.ToInt8QuantizeModule();
+ }
+ else if (quantizeToInt4)
+ {
+ model.ToInt4QuantizeModule();
+ }
+
+ var deviceSizeMap = new Dictionary<string, long>
+ {
+ ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024,
+ ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024,
+ ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024,
+ };
+
+ var deviceMap = model.InferDeviceMapForEachLayer(
+ devices: ["cuda", "cpu", "disk"],
+ deviceSizeMapInByte: deviceSizeMap);
+
+ var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true });
+ Console.WriteLine($"Device map:");
+ Console.WriteLine(deviceMapJson);
+
+ // load weight
+ torch.set_default_device("cpu");
+
+ Console.WriteLine("Start loading");
+ timer = System.Diagnostics.Stopwatch.StartNew();
+ model = new Phi3ForCasualLM(config);
+ timer.Stop();
+ Console.WriteLine($"Phi3 model created in {timer.ElapsedMilliseconds / 1000} s");
+
+ timer = System.Diagnostics.Stopwatch.StartNew();
+ model.LoadSafeTensors(weightFolder);
+ timer.Stop();
+ Console.WriteLine($"Phi3 weight loaded in {timer.ElapsedMilliseconds / 1000} s");
+
+ if (quantizeToInt8 || quantizeToInt4)
+ {
+ timer = System.Diagnostics.Stopwatch.StartNew();
+ Console.WriteLine("Start quantizing if needed");
+ if (quantizeToInt8)
+ {
+ model.ToInt8QuantizeModule();
+ }
+ else if (quantizeToInt4)
+ {
+ model.ToInt4QuantizeModule();
+ }
+ Console.WriteLine("Quantizing done");
+ timer.Stop();
+ Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s");
+ }
+
+ timer = System.Diagnostics.Stopwatch.StartNew();
+ Console.WriteLine($"Start loading to device: {device}");
+ model = model.ToDynamicLoadingModel(deviceMap, device);
+ timer.Stop();
+ Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s");
+ var pipeline = new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, model, device);
+ torch.set_default_device(device);
+
+ return pipeline;
+ }
+}
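
For orientation: the double construction above relies on TorchSharp's "meta" device, where a module carries shapes and dtypes but allocates no storage, so sizes and the device map can be planned before any weights exist. A minimal sketch, reusing names from this diff:

    torch.set_default_device("meta");                 // tensors have shape/dtype only, no storage
    var planningModel = new Phi3ForCasualLM(config);  // cheap: nothing is allocated
    var totalBytes = planningModel.GetSizeInBytes();  // extension from ModuleExtension.cs below
    torch.set_default_device("cpu");                  // switch back before the real load
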
diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
new file mode 100644
index 0000000000..1560bad306
--- /dev/null
+++ b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
@@ -0,0 +1,4 @@
+// See https://aka.ms/new-console-template for more information
+using Microsoft.ML.GenAI.Samples.Phi3Mini;
+
+await SemanticKernelSample.RunChatCompletionSample();
diff --git a/eng/Versions.props b/eng/Versions.props
index 6b88b58d6b..c7636793dd 100644
--- a/eng/Versions.props
+++ b/eng/Versions.props
@@ -63,18 +63,22 @@
0.20.1
2
2.3.1
+ 1.4.1
+ 0.0.15
+ 1.15.0
0.102.7
2.2.1.1
1.12.4
- 3.1.2
- 3.1.2
+ 6.0.2
+ 6.0.0
3.3.1
4.5.0
4.3.6
1.2.0
5.4.7
+ 4.20.70
0.13.1
6.0.26
8.0.1
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs
deleted file mode 100644
index 3a1041ee8a..0000000000
--- a/src/Microsoft.ML.GenAI.Core/Extension/CausalLMPipelineExtension.cs
+++ /dev/null
@@ -1,50 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-using static TorchSharp.torch;
-using TorchSharp;
-
-namespace Microsoft.ML.GenAI.Core.Extension;
-
-public static class CausalLMPipelineExtension
-{
- public static string? Generate(
- this CausalLMPipeline pipeline,
- string prompt,
- int maxLen = 128,
- float temperature = 0.7f,
- float topP = 0.9f,
- string[]? stopSequences = null,
- int eosId = 0,
- string device = "cpu",
- bool bos = true,
- bool eos = false,
- bool echo = false)
- {
- using var newScope = NewDisposeScope();
- var inputIds = pipeline.Tokenizer.EncodeToIds(prompt);
- var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: device).unsqueeze(0);
- var attentionMask = torch.ones_like(inputTensor);
-
- // set up stop token ids
- // stop token ids: [[eosId], [stopSequence1], [stopSequence2], ...]
- // when causal language model generates tokens, it will stop when it generates any token in stopSequences
- List<int[]> stopTokenIds = [[eosId]];
- if (stopSequences != null)
- {
- stopTokenIds.AddRange(stopSequences.Select(x => pipeline.Tokenizer.EncodeToIds(x).ToArray()));
- }
-
- (var token, var _) = pipeline.Generate(inputTensor, attentionMask, temperature: temperature, maxLen: maxLen, topP: topP, stopTokenSequence: stopTokenIds.ToArray(), echo: echo);
-
- var tokenIds = token[0].to_type(ScalarType.Int32).data<int>().ToArray();
-
- return pipeline.Tokenizer.Decode(tokenIds);
- }
-}
diff --git a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
index 6395ffd3fd..18633728a5 100644
--- a/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
+++ b/src/Microsoft.ML.GenAI.Core/Extension/ModuleExtension.cs
@@ -4,17 +4,14 @@
using System;
using System.Collections.Generic;
-using System.IO;
using System.Linq;
using System.Text;
-using Microsoft.ML.GenAI.Core;
-using Microsoft.ML.GenAI.Core.Extension;
using TorchSharp;
using static TorchSharp.torch;
namespace Microsoft.ML.GenAI.Core.Extension;
-internal static class ModuleExtension
+public static class ModuleExtension
{
public static long GetSizeInBytes(this nn.Module model)
{
@@ -59,19 +56,60 @@ public static Dictionary<string, long> GetSizeForEachDynamicLayerInBytes(this nn
}
}
- public static void ToQuantizedModule(
+ /// <summary>
+ /// Quantize the module using zero-point int8 quantization.
+ /// </summary>
+ /// <typeparam name="T">The module type.</typeparam>
+ /// <param name="model">The module to quantize in place.</param>
+ public static void ToInt8QuantizeModule(
this T model)
where T : nn.Module
{
+ if (model is IQuantizeModule quantized)
+ {
+ quantized.Int8();
+
+ return;
+ }
+
foreach (var (_, value) in model.named_children())
{
if (value is IQuantizeModule quantizeModule)
{
- quantizeModule.Quantize();
+ quantizeModule.Int8();
}
else
{
- value.ToQuantizedModule();
+ value.ToInt8QuantizeModule();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Quantize the module using zero-point int4 quantization.
+ /// </summary>
+ /// <typeparam name="T">The module type.</typeparam>
+ /// <param name="model">The module to quantize in place.</param>
+ public static void ToInt4QuantizeModule(
+ this T model)
+ where T : nn.Module
+ {
+ if (model is IQuantizeModule quantized)
+ {
+ quantized.Int4();
+
+ return;
+ }
+
+ foreach (var (_, value) in model.named_children())
+ {
+ if (value is IQuantizeModule quantizeModule)
+ {
+ quantizeModule.Int4();
+ }
+ else
+ {
+ value.ToInt4QuantizeModule();
}
}
}
@@ -159,7 +197,7 @@ public static Dictionary<string, string> InferDeviceMapForEachLayer(
return deviceMap;
}
- public static string Peek(this nn.Module model)
+ internal static string Peek(this nn.Module model)
{
var sb = new StringBuilder();
var stateDict = model.state_dict();
@@ -177,7 +215,7 @@ public static string Peek(this nn.Module model)
return res;
}
- public static string PeekShape(this nn.Module model)
+ internal static string PeekShape(this nn.Module model)
{
var sb = new StringBuilder();
var stateDict = model.state_dict();
@@ -195,47 +233,4 @@ public static string PeekShape(this nn.Module model)
return res;
}
-
- public static void LoadStateDict(this Dictionary<string, Tensor> dict, string location)
- {
- using FileStream stream = File.OpenRead(location);
- using BinaryReader reader = new BinaryReader(stream);
- var num = reader.Decode();
- for (int i = 0; i < num; i++)
- {
- var key = reader.ReadString();
- Tensor tensor = dict[key];
-
- var originalDevice = tensor.device;
- var originalType = tensor.dtype;
- if (tensor.dtype == ScalarType.BFloat16)
- {
- tensor = tensor.to_type(ScalarType.Float32);
- }
-
- TensorExtensionMethods.Load(ref tensor!, reader, skip: false);
-
- tensor = tensor!.to_type(originalType);
- dict[key] = tensor;
- }
- }
-
- public static long Decode(this BinaryReader reader)
- {
- long num = 0L;
- int num2 = 0;
- while (true)
- {
- long num3 = reader.ReadByte();
- num += (num3 & 0x7F) << num2 * 7;
- if ((num3 & 0x80) == 0L)
- {
- break;
- }
-
- num2++;
- }
-
- return num;
- }
}
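
A typical call site for the renamed quantize extensions, sketched with the Phi3ForCasualLM type added elsewhere in this PR:

    var model = new Phi3ForCasualLM(config);
    model.ToInt8QuantizeModule();    // walks named_children, quantizing every IQuantizeModule in place
    // or: model.ToInt4QuantizeModule();
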
diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
index b9f91cc943..dfb64082fb 100644
--- a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
+++ b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -1,7 +1,7 @@
-    <TargetFramework>net8.0</TargetFramework>
+    <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
     <IsPackable>false</IsPackable>
     <Nullable>enable</Nullable>
     <LangVersion>preview</LangVersion>
@@ -11,9 +11,22 @@
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs b/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs
new file mode 100644
index 0000000000..77bcadeb82
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/GenAILinear.cs
@@ -0,0 +1,52 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI;
+internal class GenAILinear : nn.Module<Tensor, Tensor>
+{
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ protected Tensor? weight;
+ protected Tensor? bias;
+ protected readonly int _inFeatures;
+ protected readonly int _outFeatures;
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+
+ public GenAILinear(int inFeatures, int outFeatures, bool hasBias = true, ScalarType dtype = ScalarType.Float32, string? device = null)
+ : base(nameof(GenAILinear))
+ {
+ this._inFeatures = inFeatures;
+ this._outFeatures = outFeatures;
+ device ??= torch.get_default_device().ToString();
+ this.weight = torch.zeros(outFeatures, inFeatures, dtype: dtype, device: device);
+
+ if (hasBias)
+ {
+ this.bias = torch.zeros(outFeatures, dtype: dtype, device: device);
+ }
+
+ base.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ using var dispose = torch.NewDisposeScope();
+
+ // use float32
+ var input2 = input.to_type(ScalarType.Float32);
+ var weight2 = this.weight!.to_type(ScalarType.Float32);
+ var result = torch.matmul(input2, weight2.t());
+
+ if (this.bias is not null)
+ {
+ result = result + this.bias.to_type(ScalarType.Float32);
+ }
+
+ return result.to_type(input.dtype).MoveToOuterDisposeScope();
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
index 164936f3d7..57c0b7620f 100644
--- a/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
+++ b/src/Microsoft.ML.GenAI.Core/Module/IQuantizeModule.cs
@@ -6,5 +6,7 @@ namespace Microsoft.ML.GenAI.Core;
public interface IQuantizeModule
{
- public void Quantize();
+ public void Int8();
+
+ public void Int4();
}
diff --git a/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs b/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs
new file mode 100644
index 0000000000..4c46e53104
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/NewGELUActivation.cs
@@ -0,0 +1,28 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI;
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+internal class NewGELUActivation : torch.nn.Module<Tensor, Tensor>
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+{
+ public NewGELUActivation()
+ : base(nameof(NewGELUActivation))
+ {
+ }
+
+ public override Tensor forward(Tensor input)
+ {
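+ // computes NewGELU(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))),
+ // built inside-out with using statements so intermediate tensors are disposed eagerly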
+ using var result = 0.044715 * torch.pow(input, 3.0);
+ using var result2 = result + input;
+ using var result3 = Math.Sqrt(2.0 / Math.PI) * result2;
+ using var result4 = torch.tanh(result3);
+ using var result5 = 1.0 + result4;
+ return 0.5 * input * result5;
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs b/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs
new file mode 100644
index 0000000000..268ac0a4a4
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Core/Module/QuantizedLinear.cs
@@ -0,0 +1,208 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+using System;
+using Microsoft.ML.GenAI.Core;
+using TorchSharp;
+using static TorchSharp.torch;
+namespace Microsoft.ML.GenAI;
+
+internal class QuantizedLinear : GenAILinear, IQuantizeModule
+{
+ public QuantizedLinear(int inFeatures, int outFeatures, bool hasBias = true, ScalarType dtype = ScalarType.Float32, string? device = null)
+ : base(inFeatures, outFeatures, hasBias, dtype, device)
+ {
+ }
+
+ public void Int8()
+ {
+ if (this.weight is null)
+ {
+ throw new Exception("Weight is not initialized");
+ }
+
+ if (this.weight.device_type != DeviceType.META)
+ {
+ // if weight is not on meta device, this means that weight and bias are already loaded
+ // so we can quantize them in memory
+
+ var timer = new System.Diagnostics.Stopwatch();
+ timer.Start();
+ // scale and zero point on vector-wise
+ // scale = 255 / max(weight, axis=1) - min(weight, axis=1)
+ var scale = 255 / (torch.max(this.weight, 1).values - torch.min(this.weight, 1).values);
+
+ // zero point = - scale * min(weight, axis=1) - 128
+ var zeroPoint = -scale * torch.min(this.weight, 1).values - 128;
+ // round zero point to nearest integer
+ zeroPoint = torch.round(zeroPoint).to(torch.int8);
+
+ // assert zero point is in range [-128, 127]
+ //if (torch.any(this.zeroPoint < -128).item() || torch.any(this.zeroPoint > 127).item())
+ //{
+ // throw new Exception("Zero point is out of range [-128, 127]");
+ //}
+
+ // quantize weight
+ var eightBitWeight = torch.round(this.weight * scale.view(-1, 1) + zeroPoint.view(-1, 1)).to(torch.int8);
+
+ // assert weight is in range [-128, 127]
+ //if (torch.any(this._8bitWeight < -128).item() || torch.any(this._8bitWeight > 127).item())
+ //{
+ // throw new Exception("Weight is out of range [-128, 127]");
+ //}
+ timer.Stop();
+ // dispose float32 weight
+ this.weight.Dispose();
+ this.weight = null;
+ this._internal_buffers.Remove("weight");
+ this.register_buffer("8bit_weight", eightBitWeight);
+ this.register_buffer("zeroPoint", zeroPoint);
+ this.register_buffer("scale", scale);
+ }
+ else
+ {
+ // if weight is on meta device, then we just need to create the placeholder for 8bit_weight, zeroPoint and scale
+ var eightBitWeight = torch.zeros(this.weight.shape, dtype: torch.int8);
+ var zeroPoint = torch.zeros(this.weight.shape[0], dtype: torch.int8);
+ var scale = torch.zeros(this.weight.shape[0], dtype: torch.float32);
+
+ this._internal_buffers.Remove("weight");
+ this.weight = null;
+ this.register_buffer("8bit_weight", eightBitWeight);
+ this.register_buffer("zeroPoint", zeroPoint);
+ this.register_buffer("scale", scale);
+ }
+ }
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ if (this._internal_buffers.ContainsKey("weight"))
+ {
+ return base.forward(input);
+ }
+ else if (this._internal_buffers.ContainsKey("8bit_weight"))
+ {
+ // 8bit quantization
+ using var dispose = torch.NewDisposeScope();
+ var weight = this.get_buffer("8bit_weight").to(ScalarType.Float32);
+ var zeroPoint = this.get_buffer("zeroPoint").to(ScalarType.Float32);
+ var scale = this.get_buffer("scale").to(ScalarType.Float32);
+ var restoreWeight = (weight - zeroPoint.view(-1, 1)) / scale.view(-1, 1);
+ // use float32
+ var result = torch.matmul(input.to(ScalarType.Float32), restoreWeight.T);
+
+ if (this.bias is not null)
+ {
+ result = result + this.bias.to_type(ScalarType.Float32);
+ }
+
+ //result.Peek("result");
+ return result.to_type(input.dtype).MoveToOuterDisposeScope();
+ }
+ else if (this._internal_buffers.ContainsKey("4bit_weight"))
+ {
+ using var dispose = torch.NewDisposeScope();
+ var weight = this.get_buffer("4bit_weight");
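+ // each byte packs two 4-bit values: the high nibble holds the first half of the
+ // flattened weight and the low nibble the second half (mirrors the packing in Int4())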
+ var weightLower = weight % 16;
+ var weightUpper = weight / 16;
+ weight = torch.cat([weightUpper, weightLower], 0).to(ScalarType.Float32);
+ weight = weight.view(this._outFeatures, this._inFeatures);
+ weight -= 8;
+ var zeroPoint = this.get_buffer("zeroPoint");
+ var zeroPointLower = zeroPoint % 16;
+ var zeroPointUpper = zeroPoint / 16;
+ zeroPoint = torch.cat([zeroPointUpper, zeroPointLower], 0).to(ScalarType.Float32);
+ zeroPoint -= 8;
+ var scale = this.get_buffer("scale").to(ScalarType.Float32);
+ var restoreWeight = (weight - zeroPoint.view(-1, 1)) / scale.view(-1, 1);
+ // use float32
+ var result = torch.matmul(input.to(ScalarType.Float32), restoreWeight.T);
+
+ if (this.bias is not null)
+ {
+ result = result + this.bias.to_type(ScalarType.Float32);
+ }
+
+ //result.Peek("result");
+ return result.to_type(input.dtype).MoveToOuterDisposeScope();
+ }
+ else
+ {
+ throw new Exception("Quantization is not done yet");
+ }
+ }
+
+ public void Int4()
+ {
+ if (this.weight is null)
+ {
+ throw new Exception("Weight is not initialized");
+ }
+ var placeHolderDim = this._outFeatures / 2 + this._outFeatures % 2;
+ var fourBitWeightDim = this.weight.size(0) * this.weight.size(1);
+ var fourBitWeightPlaceHolderDim = Convert.ToInt32(fourBitWeightDim / 2 + fourBitWeightDim % 2);
+ if (this.weight.device_type != DeviceType.META)
+ {
+ using var scope = NewDisposeScope();
+ var timer = new System.Diagnostics.Stopwatch();
+ timer.Start();
+ // scale and zero point are computed per output row (vector-wise)
+ // scale = 15 / (max(weight, axis=1) - min(weight, axis=1))
+ var scale = 15 / (torch.max(this.weight, 1).values - torch.min(this.weight, 1).values);
+
+ // zero point = - scale * min(weight, axis=1) - 8
+ var zeroPoint = -scale * torch.min(this.weight, 1).values - 8;
+ // round zero point to nearest integer
+ zeroPoint = torch.round(zeroPoint);
+ var fourBitWeight = torch.round(this.weight * scale.view(-1, 1) + zeroPoint.view(-1, 1)).to(torch.int8);
+
+ zeroPoint = (zeroPoint + 8).to(torch.uint8);
+ fourBitWeight = (fourBitWeight + 8).view(-1).to(torch.uint8);
+
+ // torch doesn't provide int4, so we use int8 as placeholder
+ // and foreach int8, we save two int4, e.g. 0b1010 -> 0b10, 0b10
+ var zpPlaceHolder = zeroPoint[..placeHolderDim];
+ zpPlaceHolder = zpPlaceHolder * 16 + zeroPoint[placeHolderDim..];
+
+ // assert zero point is in range [-128, 127]
+ //if (torch.any(this.zeroPoint < -128).item() || torch.any(this.zeroPoint > 127).item())
+ //{
+ // throw new Exception("Zero point is out of range [-128, 127]");
+ //}
+
+ // quantize weight
+ var fourBitWeightPlaceHolder = fourBitWeight[..fourBitWeightPlaceHolderDim];
+ fourBitWeightPlaceHolder = fourBitWeightPlaceHolder * 16 + fourBitWeight[fourBitWeightPlaceHolderDim..];
+
+ // assert weight is in range [-128, 127]
+ //if (torch.any(this._8bitWeight < -128).item() || torch.any(this._8bitWeight > 127).item())
+ //{
+ // throw new Exception("Weight is out of range [-128, 127]");
+ //}
+
+ // dispose float32 weight
+ this.weight.Dispose();
+
+ this._internal_buffers.Remove("weight");
+ this.register_buffer("4bit_weight", fourBitWeightPlaceHolder.MoveToOuterDisposeScope());
+ this.register_buffer("zeroPoint", zpPlaceHolder.MoveToOuterDisposeScope());
+ this.register_buffer("scale", scale.MoveToOuterDisposeScope());
+ timer.Stop();
+ }
+ else
+ {
+ // if weight is on meta device, then we just need to create the placeholder for 4bit_weight, zeroPoint and scale
+ var fourBitWeight = torch.zeros(fourBitWeightPlaceHolderDim, dtype: torch.int8);
+ var zeroPoint = torch.zeros(placeHolderDim, dtype: torch.int8);
+ var scale = torch.zeros(this.weight.shape[0], dtype: torch.float32);
+
+ this._internal_buffers.Remove("weight");
+ this.weight = null;
+ this.register_buffer("4bit_weight", fourBitWeight);
+ this.register_buffer("zeroPoint", zeroPoint);
+ this.register_buffer("scale", scale);
+ }
+ }
+}
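
A worked sketch of the per-row zero-point scheme above, with hypothetical values (assumes using TorchSharp and using static TorchSharp.torch):

    var w = torch.tensor(new float[] { -1f, 0f, 1.5f, 3f }).view(1, -1);  // one row: min = -1, max = 3
    var scale = 255f / (w.max().item<float>() - w.min().item<float>());   // 255 / 4 = 63.75
    var zeroPoint = MathF.Round(-scale * w.min().item<float>() - 128f);   // round(63.75 - 128) = -64
    var q = torch.round(w * scale + zeroPoint).to(torch.int8);            // [-128, -64, 32, 127]
    var dequantized = (q.to(float32) - zeroPoint) / scale;                // ≈ w, error at most 0.5 / scale
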
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
index 31b7530b88..49fcfef627 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelInput.cs
@@ -8,15 +8,25 @@ namespace Microsoft.ML.GenAI.Core;
public class CasualLMModelInput
{
+ internal static class Defaults
+ {
+ internal const Tensor? AttentionMask = null;
+ internal const Tensor? PositionIds = null;
+ internal const int PastKeyValuesLength = 0;
+ internal const Tensor? InputsEmbeds = null;
+ internal const bool UseCache = false;
+ internal const bool OutputAttentions = false;
+ internal const bool OutputHiddenStates = false;
+ }
public CasualLMModelInput(
Tensor inputIds,
- Tensor? attentionMask = null,
- Tensor? positionIds = null,
- int pastKeyValuesLength = 0,
- Tensor? inputsEmbeds = null,
- bool useCache = false,
- bool outputAttentions = false,
- bool outputHiddenStates = false)
+ Tensor? attentionMask = Defaults.AttentionMask,
+ Tensor? positionIds = Defaults.PositionIds,
+ int pastKeyValuesLength = Defaults.PastKeyValuesLength,
+ Tensor? inputsEmbeds = Defaults.InputsEmbeds,
+ bool useCache = Defaults.UseCache,
+ bool outputAttentions = Defaults.OutputAttentions,
+ bool outputHiddenStates = Defaults.OutputHiddenStates)
{
this.InputIds = inputIds;
this.AttentionMask = attentionMask;
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
index 10dde68852..afaa84e778 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CasualLMModelOutput.cs
@@ -8,12 +8,19 @@ namespace Microsoft.ML.GenAI.Core;
public class CasualLMModelOutput
{
+ internal static class Defaults
+ {
+ internal const Tensor? Logits = null;
+ internal const Tensor[]? AllHiddenStates = null;
+ internal const Tensor[]? Attentions = null;
+ internal const IKVCache? Cache = null;
+ }
public CasualLMModelOutput(
Tensor lastHiddenState,
- Tensor logits,
- Tensor[]? allHiddenStates = null,
- Tensor[]? attentions = null,
- IKVCache? cache = null)
+ Tensor? logits = Defaults.Logits,
+ Tensor[]? allHiddenStates = Defaults.AllHiddenStates,
+ Tensor[]? attentions = Defaults.Attentions,
+ IKVCache? cache = Defaults.Cache)
{
this.LastHiddenState = lastHiddenState;
this.AllHiddenStates = allHiddenStates;
@@ -22,7 +29,7 @@ public CasualLMModelOutput(
this.Cache = cache;
}
- public Tensor Logits { get; set; }
+ public Tensor? Logits { get; set; }
public Tensor LastHiddenState { get; set; }
diff --git a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
index fa61f7b43a..8c90a94cf7 100644
--- a/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
+++ b/src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -7,128 +7,267 @@
using System.Linq;
using System.Text;
using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core.Extension;
using Microsoft.ML.Tokenizers;
using TorchSharp;
using static TorchSharp.torch;
namespace Microsoft.ML.GenAI.Core;
-public class CausalLMPipeline<TTokenizer, TModel> : CausalLMPipeline
+public interface ICausalLMPipeline<TTokenizer, TModel> : ICausalLMPipeline
+ where TTokenizer : Tokenizer
+ where TModel : nn.Module
+{
+ TTokenizer Tokenizer { get; }
+
+ TModel Model { get; }
+}
+
+public interface ICausalLMPipeline
+{
+ string Generate(
+ string prompt,
+ int maxLen = CausalLMPipeline.Defaults.MaxLen,
+ float temperature = CausalLMPipeline.Defaults.Temperature,
+ float topP = CausalLMPipeline.Defaults.TopP,
+ string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
+
+ IEnumerable<string> GenerateStreaming(
+ string prompt,
+ int maxLen = CausalLMPipeline.Defaults.MaxLen,
+ float temperature = CausalLMPipeline.Defaults.Temperature,
+ float topP = CausalLMPipeline.Defaults.TopP,
+ string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
+
+ (Tensor, Tensor) Generate(
+ Tensor inputIds,
+ Tensor attentionMask,
+ int[][] stopTokenSequence,
+ float temperature = CausalLMPipeline.Defaults.Temperature,
+ float topP = CausalLMPipeline.Defaults.TopP,
+ int maxLen = CausalLMPipeline.Defaults.MaxLen);
+
+ IEnumerable<(Tensor, Tensor)> GenerateStreaming(
+ Tensor inputIds,
+ Tensor attentionMask,
+ int[][] stopTokenSequence,
+ float temperature = CausalLMPipeline.Defaults.Temperature,
+ float topP = CausalLMPipeline.Defaults.TopP,
+ int maxLen = CausalLMPipeline.Defaults.MaxLen);
+}
+
+public class CausalLMPipeline<TTokenizer, TModel> : CausalLMPipeline, ICausalLMPipeline<TTokenizer, TModel>
where TTokenizer : Tokenizer
where TModel : nn.Module
{
public CausalLMPipeline(
TTokenizer tokenizer,
TModel model,
- string device = "cpu")
+ string device = Defaults.Device)
: base(tokenizer, model, device)
{
}
+
+ public new TTokenizer Tokenizer { get => (TTokenizer)base.Tokenizer; }
+
+ public new TModel Model { get => (TModel)base.Model; }
}
-public class CausalLMPipeline
+public class CausalLMPipeline : ICausalLMPipeline
{
+ internal static class Defaults
+ {
+ internal const string Device = "cpu";
+ internal const float Temperature = 0.7F;
+ internal const float TopP = 0.9F;
+ internal const int MaxLen = 128;
+ internal const string[]? StopSequence = null;
+ }
+
public CausalLMPipeline(
Tokenizer tokenizer,
nn.Module model,
- string device = "cpu")
+ string device = Defaults.Device)
{
this.Tokenizer = tokenizer;
this.Model = model;
this.Device = device;
}
+ /// <summary>
+ /// For mocking purposes (e.g. Moq).
+ /// </summary>
+ private protected CausalLMPipeline()
+ {
+ this.Tokenizer = default!;
+ this.Model = default!;
+ this.Device = default!;
+ }
+
public Tokenizer Tokenizer { get; }
public nn.Module Model { get; }
public Device Device { get; }
- public virtual (
- Tensor, // output token ids [batch_size, sequence_length]
- Tensor // output logits [batch_size, sequence_length, vocab_size]
- ) Generate(
- Tensor inputIds, // input token ids [batch_size, sequence_length]
- Tensor attentionMask, // attention mask [batch_size, sequence_length]
+ public IEnumerable<(
+ Tensor, // output token ids [batch_size, 1]
+ Tensor // output logits [batch_size, 1, vocab_size]
+ )> GenerateStreaming(
+ Tensor inputIds,
+ Tensor attentionMask,
int[][] stopTokenSequence,
- float temperature = 0.7f,
- float topP = 0.9f,
- int maxLen = 128,
- bool echo = false)
+ float temperature = Defaults.Temperature,
+ float topP = Defaults.TopP,
+ int maxLen = Defaults.MaxLen)
{
- using var newScope = NewDisposeScope();
+ using var scope = NewDisposeScope();
+ using var noGrad = torch.no_grad();
var batch = inputIds.shape[0];
var device = inputIds.device;
var promptLength = (int)inputIds.shape[1];
var totalLen = promptLength + maxLen;
- using (var noGrad = torch.no_grad())
+ var prevPos = 0;
+ var eosReached = torch.tensor(new bool[batch], device: device);
+ torch.Tensor? logits = default;
+ var cache = new DynamicKVCache();
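+ // incremental decoding: each step feeds only the tokens in [prevPos, curPos);
+ // earlier positions are served from the KV cache via pastKeyValuesLength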
+ if (promptLength == totalLen)
{
- var prevPos = 0;
- var eosReached = torch.tensor(new bool[batch], device: device);
- torch.Tensor? logits = default;
- var cache = new DynamicKVCache();
- if (promptLength == totalLen)
+ var input = new CasualLMModelInput(inputIds, attentionMask, pastKeyValuesLength: 0)
+ {
+ OverrideCache = cache,
+ };
+ var output = this.Model.forward(input);
+ logits = output.Logits;
+ }
+ for (var curPos = promptLength; curPos != totalLen; curPos++)
+ {
+ var input = new CasualLMModelInput(inputIds[.., prevPos..curPos], attentionMask[.., prevPos..curPos], pastKeyValuesLength: prevPos)
+ {
+ OverrideCache = cache,
+ };
+ var output = this.Model.forward(input);
+ logits = output.Logits?.MoveToOtherDisposeScope(inputIds) ?? throw new InvalidOperationException("Logits is null");
+ torch.Tensor nextToken;
+ if (temperature > 0)
+ {
+ var probs = torch.softmax(logits[.., -1] / temperature, dim: -1);
+ nextToken = this.SampleTopP(probs, topP);
+ }
+ else
+ {
+ nextToken = torch.argmax(logits[.., -1], dim: -1);
+ }
+
+ nextToken = nextToken.reshape(-1);
+ inputIds = torch.cat([inputIds, nextToken.unsqueeze(1)], dim: -1).MoveToOtherDisposeScope(inputIds);
+ attentionMask = torch.cat([attentionMask, attentionMask.new_ones(attentionMask.shape[0], 1)], dim: -1);
+ foreach (var stopSequence in stopTokenSequence)
{
- var input = new CasualLMModelInput(inputIds, attentionMask, pastKeyValuesLength: 0)
- {
- OverrideCache = cache,
- };
- var output = this.Model.forward(input);
- logits = output.Logits;
+ // determine if the last n tokens are the stop sequence
+ var lastN = inputIds[.., ^stopSequence.Length..];
+ var lastNMatch = lastN == torch.tensor(stopSequence, device: device);
+ eosReached |= lastNMatch.all(dim: -1);
}
- for (var curPos = promptLength; curPos != totalLen; curPos++)
+ if (eosReached.all().item())
{
- var input = new CasualLMModelInput(inputIds[.., prevPos..curPos], attentionMask[.., prevPos..curPos], pastKeyValuesLength: prevPos)
- {
- OverrideCache = cache,
- };
- var output = this.Model.forward(input);
- logits = output.Logits;
- torch.Tensor nextToken;
- if (temperature > 0)
- {
- var probs = torch.softmax(logits[.., -1] / temperature, dim: -1);
- nextToken = this.SampleTopP(probs, topP);
- }
- else
- {
- nextToken = torch.argmax(logits[.., -1], dim: -1);
- }
-
- nextToken = nextToken.reshape(-1);
- inputIds = torch.cat([inputIds, nextToken.unsqueeze(1)], dim: -1);
- attentionMask = torch.cat([attentionMask, attentionMask.new_ones(attentionMask.shape[0], 1)], dim: -1);
- foreach (var stopSequence in stopTokenSequence)
- {
- // determine if the last n tokens are the stop sequence
- var lastN = inputIds[.., ^stopSequence.Length..];
- var lastNMatch = lastN == torch.tensor(stopSequence, device: device);
- eosReached |= lastNMatch.all(dim: -1);
- }
- if (eosReached.all().item())
- {
- break;
- }
-
- // pBar.Tick(curPos, message);
- var nextTokenIds = nextToken.to_type(ScalarType.Int32).data().ToArray();
- var nextTokenStr = this.Tokenizer.Decode(nextTokenIds);
-
- prevPos = curPos;
+ break;
}
- if (echo)
+ yield return (nextToken.MoveToOuterDisposeScope(), logits[.., ^1].MoveToOuterDisposeScope());
+ prevPos = curPos;
+ }
+ }
+
+ public virtual (
+ Tensor, // output token ids [batch_size, sequence_length]
+ Tensor // output logits [batch_size, sequence_length, vocab_size]
+ ) Generate(
+ Tensor inputIds, // input token ids [batch_size, sequence_length]
+ Tensor attentionMask, // attention mask [batch_size, sequence_length]
+ int[][] stopTokenSequence,
+ float temperature = 0.7f,
+ float topP = 0.9f,
+ int maxLen = 128)
+ {
+ using var scope = NewDisposeScope();
+ Tensor? logits = null;
+ foreach (var (token, _logits) in this.GenerateStreaming(inputIds, attentionMask, stopTokenSequence, temperature, topP, maxLen))
+ {
+ inputIds = torch.cat([inputIds, token.unsqueeze(1)], dim: -1).MoveToOtherDisposeScope(inputIds);
+ if (logits is null)
{
- // return entire inputIds and logits
- return (inputIds.MoveToOuterDisposeScope(), logits!.MoveToOuterDisposeScope());
+ logits = _logits;
}
else
{
- // return [batch_size, promptLength..] and [batch_size, promptLength.., vocab_size]
- return (inputIds[.., promptLength..].MoveToOuterDisposeScope(), logits![.., promptLength..].MoveToOuterDisposeScope());
+ logits = torch.cat([logits, _logits], dim: -1).MoveToOtherDisposeScope(inputIds);
}
}
+
+ return (inputIds, logits ?? throw new InvalidOperationException("Logits is null"));
+ }
+
+ public virtual string Generate(
+ string prompt,
+ int maxLen = 128,
+ float temperature = 0.7f,
+ float topP = 0.9f,
+ string[]? stopSequences = null)
+ {
+ var chunks = new List<string>();
+
+ foreach (var chunk in this.GenerateStreaming(prompt, maxLen, temperature, topP, stopSequences))
+ {
+ chunks.Add(chunk);
+ }
+
+ return string.Join(string.Empty, chunks);
+ }
+
+
+ public virtual IEnumerable<string> GenerateStreaming(
+ string prompt,
+ int maxLen = 128,
+ float temperature = 0.7F,
+ float topP = 0.9F,
+ string[]? stopSequences = Defaults.StopSequence)
+ {
+ using var newScope = NewDisposeScope();
+ var inputIds = this.Tokenizer.EncodeToIds(prompt);
+ var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
+ var attentionMask = torch.ones_like(inputTensor, device: this.Device);
+ // set up stop token ids
+ // stop token ids: [[eosId], [stopSequence1], [stopSequence2], ...]
+ // when causal language model generates tokens, it will stop when it generates any token in stopSequences
+ List<int[]> stopTokenIds = [[]];
+ if (stopSequences != null)
+ {
+ stopTokenIds.AddRange(stopSequences.Select(x =>
+ {
+ var tokens = this.Tokenizer.EncodeToTokens(x, out var _, false, false);
+
+ return tokens
+ // Skip the first _ token automatically added by tokenizer
+ .Where(t => t.Offset != (0, 0))
+ .Select(t => t.Id)
+ .ToArray();
+ }));
+ }
+
+ stopTokenIds = stopTokenIds.Where(ids => ids.Count() > 0).ToList();
+
+ foreach (var (token, _) in this.GenerateStreaming(inputTensor, attentionMask, stopTokenIds.ToArray(), temperature: temperature, maxLen: maxLen))
+ {
+ var tokenIds = token[0].to_type(ScalarType.Int32).data<int>().ToArray();
+ var duplicateTokenString = this.Tokenizer.Decode(tokenIds.Concat(tokenIds)) ?? throw new InvalidOperationException("Failed to decode token ids");
+ var tokenString = this.Tokenizer.Decode(tokenIds) ?? throw new InvalidOperationException("Failed to decode token ids");
+ // decode the token appended to itself, then strip the single-token prefix;
+ // this preserves leading whitespace that Decode would drop for a lone token
+ tokenString = duplicateTokenString.Substring(tokenString.Length);
+
+ yield return tokenString;
+ }
}
protected torch.Tensor SampleTopP(torch.Tensor logits, float topP)
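
A minimal usage sketch of the reworked pipeline surface (tokenizer/model construction elided; the "<|end|>" stop sequence is an assumption, not something this diff pins down):

    ICausalLMPipeline pipeline = new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");

    // blocking: collects the streamed chunks into a single string
    var reply = pipeline.Generate("What is one plus one?", maxLen: 64, temperature: 0f);

    // streaming: yields decoded text chunk by chunk as tokens are sampled
    foreach (var chunk in pipeline.GenerateStreaming("Tell a joke", stopSequences: new[] { "<|end|>" }))
    {
        Console.Write(chunk);
    }
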
diff --git a/src/Microsoft.ML.GenAI.Phi/Extension/SemanticKernelExtension.cs b/src/Microsoft.ML.GenAI.Phi/Extension/SemanticKernelExtension.cs
new file mode 100644
index 0000000000..ace7a7b425
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Extension/SemanticKernelExtension.cs
@@ -0,0 +1,33 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.Tokenizers;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel.TextGeneration;
+
+namespace Microsoft.ML.GenAI.Phi.Extension;
+
+public static class SemanticKernelExtension
+{
+ public static IKernelBuilder AddGenAIChatCompletion(
+ this IKernelBuilder builder,
+ ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
+ {
+ builder.Services.AddSingleton(new Phi3CausalLMChatCompletionService(pipeline));
+
+ return builder;
+ }
+
+ public static IKernelBuilder AddGenAITextGeneration(
+ this IKernelBuilder builder,
+ ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
+ {
+ builder.Services.AddSingleton(new Phi3CausalLMTextGenerationService(pipeline));
+
+ return builder;
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj
new file mode 100644
index 0000000000..a9556443dd
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj
@@ -0,0 +1,37 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
+    <Nullable>enable</Nullable>
+    <ImplicitUsings>enable</ImplicitUsings>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!-- project references, package references, and embedded resources elided -->
+  </ItemGroup>
+
+</Project>
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs
new file mode 100644
index 0000000000..918ae7c99b
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Attention.cs
@@ -0,0 +1,159 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics.Contracts;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi2Attention : nn.Module<
+ Tensor, // hidden_states
+ Tensor, // position_ids
+ Tensor?, // attention_mask
+ int, // past_key_value_length
+ bool, // output_attentions
+ (
+ Tensor, // hidden_states,
+ Tensor?, // attentions,
+ Tensor? // present_key_value
+ )>
+{
+ private readonly int? _layerIdx;
+ private readonly Phi2Config _config;
+ private readonly double _attentionDropout;
+ private readonly int _hiddenSize;
+ private readonly int _numAttentionHeads;
+ private readonly int _headDim;
+ private readonly int _numKeyValueHeads;
+ private readonly int _numKeyValueGroups;
+ private readonly int _maxPositionEmbeddings;
+ private readonly double _ropeTheta;
+ private readonly double _partialRotaryFactor;
+ private readonly bool _qkLayernorm;
+
+ // we disable the warning for the private field name not in _camelCase format for all submodules fields
+ // because their name will be used as keys to load the corresponding weights from the checkpoint
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly GenAILinear q_proj;
+ private readonly GenAILinear k_proj;
+ private readonly GenAILinear v_proj;
+ private readonly GenAILinear dense;
+ private readonly LayerNorm? q_layernorm;
+ private readonly LayerNorm? k_layernorm;
+
+ private readonly Phi2RotaryEmbedding phiRotaryEmbedding;
+
+ // cache_k, cache_v
+ private Tensor cache_k;
+ private Tensor cache_v;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi2Attention(Phi2Config config, int? layerIdx = null, int maxBatch = 2, int maxLength = 1024)
+ : base(nameof(Phi2Attention))
+ {
+ this._layerIdx = layerIdx;
+ this._config = config;
+ this._attentionDropout = config.AttentionDropout;
+ this._hiddenSize = config.HiddenSize;
+ this._numAttentionHeads = config.NumAttentionHeads;
+ this._headDim = this._hiddenSize / this._numAttentionHeads;
+ this._numKeyValueHeads = config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified");
+ this._numKeyValueGroups = this._numAttentionHeads / this._numKeyValueHeads;
+ this._maxPositionEmbeddings = config.MaxPositionEmbeddings;
+ this._ropeTheta = config.RopeTheta;
+ this._partialRotaryFactor = config.PartialRotaryFactor;
+
+ Contract.Assert(this._hiddenSize % (this._headDim * this._numAttentionHeads) == 0, "hidden_size must be divisible by num_attention_heads");
+ this.q_proj = new GenAILinear(this._hiddenSize, this._numAttentionHeads * this._headDim, hasBias: true, dtype: config.Dtype);
+ this.k_proj = new GenAILinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: true, dtype: config.Dtype);
+ this.v_proj = new GenAILinear(this._hiddenSize, this._numKeyValueHeads * this._headDim, hasBias: true, dtype: config.Dtype);
+ this.dense = new GenAILinear(this._numAttentionHeads * this._headDim, this._hiddenSize, hasBias: true, dtype: config.Dtype);
+
+ this._qkLayernorm = config.QkLayernorm;
+ if (this._qkLayernorm)
+ {
+ this.q_layernorm = nn.LayerNorm(this._hiddenSize / this._numAttentionHeads, eps: config.LayerNormEps, elementwise_affine: true, dtype: config.Dtype);
+ this.k_layernorm = nn.LayerNorm(this._hiddenSize / this._numAttentionHeads, eps: config.LayerNormEps, elementwise_affine: true, dtype: config.Dtype);
+ }
+
+ this.RegisterComponents();
+ this.phiRotaryEmbedding = new Phi2RotaryEmbedding(
+ dim: (int)(this._partialRotaryFactor * this._headDim),
+ maxPositionEmbeddings: this._maxPositionEmbeddings,
+ baseValue: this._config.RopeTheta);
+ this.cache_k = torch.zeros(maxBatch, this._numKeyValueHeads, maxLength, this._headDim, dtype: config.Dtype);
+ this.cache_v = torch.zeros(maxBatch, this._numKeyValueHeads, maxLength, this._headDim, dtype: config.Dtype);
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override (Tensor, Tensor?, Tensor?) forward(
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ Tensor hiddenStates,
+ Tensor positionIds,
+ Tensor? attentionMask = null,
+ int pastKeyValueLength = 0,
+ bool outputAttentions = false)
+ {
+ // move cache to the same device as hiddenStates
+ if (this.cache_k.device != hiddenStates.device)
+ {
+ this.cache_k = this.cache_k.to(hiddenStates.device, disposeAfter: true).DetachFromDisposeScope();
+ this.cache_v = this.cache_v.to(hiddenStates.device, disposeAfter: true).DetachFromDisposeScope();
+ }
+
+ using var disposeScope = torch.NewDisposeScope();
+ var batchSize = (int)hiddenStates.shape[0];
+ var seqLen = (int)hiddenStates.shape[1];
+
+ var queryStates = this.q_proj.forward(hiddenStates);
+ var keyStates = this.k_proj.forward(hiddenStates);
+ var valueStates = this.v_proj.forward(hiddenStates);
+ if (this._qkLayernorm)
+ {
+ queryStates = this.q_layernorm!.forward(queryStates);
+ keyStates = this.k_layernorm!.forward(keyStates);
+ }
+
+ queryStates = queryStates.view(batchSize, seqLen, this._numAttentionHeads, this._headDim).transpose_(1, 2);
+ keyStates = keyStates.view(batchSize, seqLen, this._numKeyValueHeads, this._headDim).transpose_(1, 2);
+ valueStates = valueStates.view(batchSize, seqLen, this._numKeyValueHeads, this._headDim).transpose_(1, 2);
+ var kvSeqLen = pastKeyValueLength == 0 ? (int)keyStates.shape[2] : pastKeyValueLength + (int)keyStates.shape[2];
+ (var cos, var sin) = this.phiRotaryEmbedding.forward(valueStates, kvSeqLen);
+ // split the last dim of queryStates and keyStates into rotary and non-rotary parts
+ // shape: [batch_size, num_heads, seq_len, head_dim]
+ // queryRot: [batch_size, num_heads, seq_len, :head_dim * partial_rotary_factor]
+ // queryPass: [batch_size, num_heads, seq_len, head_dim * partial_rotary_factor:]
+ var keyRot = keyStates[.., .., .., ..this.phiRotaryEmbedding.Dim];
+ var keyPass = keyStates[.., .., .., this.phiRotaryEmbedding.Dim..];
+ var queryRot = queryStates[.., .., .., ..this.phiRotaryEmbedding.Dim];
+ var queryPass = queryStates[.., .., .., this.phiRotaryEmbedding.Dim..];
+ (var qRot, var kRot) = Utils.ApplyRotaryPosEmb(queryRot, keyRot, cos, sin, positionIds);
+
+ queryStates = torch.cat([qRot, queryPass], dim: -1);
+ // update cache
+ keyStates = torch.cat([kRot, keyPass], dim: -1);
+ this.cache_k[..batchSize, .., pastKeyValueLength..kvSeqLen, ..] = keyStates;
+ this.cache_v[..batchSize, .., pastKeyValueLength..kvSeqLen, ..] = valueStates;
+ keyStates = this.cache_k[..batchSize, .., ..kvSeqLen, ..];
+ valueStates = this.cache_v[..batchSize, .., ..kvSeqLen, ..];
+ var keyStates2 = Utils.Phi2RepeatKV(keyStates, this._numKeyValueGroups).transpose(2, 3);
+ var valueStates2 = Utils.Phi2RepeatKV(valueStates, this._numKeyValueGroups);
+        // upcasting queries and keys to fp32 is required by Phi-2 to avoid overflow
+ var attnWeights = torch.matmul(queryStates.to_type(float32), keyStates2.to_type(float32));
+ attnWeights = attnWeights / Math.Sqrt(this._headDim);
+ if (attentionMask is not null)
+ {
+ attnWeights = attnWeights + attentionMask;
+ }
+ attnWeights = nn.functional.softmax(attnWeights, dim: -1);
+ attnWeights = nn.functional.dropout(attnWeights, p: this._attentionDropout);
+ var attnOutput = torch.matmul(attnWeights, valueStates2.to_type(float32)).to_type(hiddenStates.dtype);
+ attnOutput = attnOutput.transpose_(1, 2).contiguous();
+ attnOutput = attnOutput.reshape(batchSize, seqLen, this._hiddenSize);
+ var result = this.dense.forward(attnOutput);
+ return (result.MoveToOuterDisposeScope(), null, null);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2DecoderLayer.cs
new file mode 100644
index 0000000000..7931e32b79
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2DecoderLayer.cs
@@ -0,0 +1,66 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+public class Phi2DecoderLayer : nn.Module<
+ Tensor, // hidden_states
+ Tensor, // position_ids
+ Tensor?, // attention_mask
+ int, // past_key_value_length
+ bool, // use_cache
+ bool, // output_attentions
+ (
+ Tensor, // hidden_states,
+ Tensor?, // attentions,
+ Tensor? // present_key_value
+ )>
+{
+ private readonly int? _layerIdx;
+
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Phi2Attention self_attn;
+ private readonly Phi2MLP mlp;
+ private readonly LayerNorm input_layernorm;
+ private readonly Dropout resid_dropout;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi2DecoderLayer(Phi2Config config, int? layerIdx = null)
+ : base(nameof(Phi2DecoderLayer))
+ {
+ this._layerIdx = layerIdx;
+ this.self_attn = new Phi2Attention(config, layerIdx);
+ this.mlp = new Phi2MLP(config);
+ this.input_layernorm = nn.LayerNorm(config.HiddenSize, eps: config.LayerNormEps, dtype: config.Dtype);
+        this.resid_dropout = nn.Dropout(config.ResidPdrop);
+        this.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override (Tensor, Tensor?, Tensor?) forward(
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ Tensor hiddenStates,
+ Tensor positionIds,
+ Tensor? attentionMask = null,
+ int pastKeyValueLength = 0,
+ bool useCache = false,
+ bool outputAttentions = false)
+ {
+        using var disposeScope = torch.NewDisposeScope();
+ var residual = hiddenStates;
+ hiddenStates = this.input_layernorm.forward(hiddenStates);
+ (var attnOutput, var attnWeights, var presentKeyValue) = this.self_attn.forward(
+ hiddenStates: hiddenStates,
+ positionIds: positionIds,
+ attentionMask: attentionMask,
+ pastKeyValueLength: pastKeyValueLength,
+ outputAttentions: outputAttentions);
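+        // Phi-2 uses a parallel residual: the MLP consumes the same layer-normed
+        // input as attention, and both outputs are summed with the residual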
+        var feedForwardHiddenStates = this.resid_dropout.forward(this.mlp.forward(hiddenStates));
+        attnOutput = this.resid_dropout.forward(attnOutput);
+        hiddenStates = residual + feedForwardHiddenStates + attnOutput;
+
+ return (hiddenStates.MoveToOuterDisposeScope(), null, null);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs
new file mode 100644
index 0000000000..384d012e22
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2MLP.cs
@@ -0,0 +1,37 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+internal class Phi2MLP : torch.nn.Module<Tensor, Tensor>
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+{
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly GenAILinear fc1;
+ private readonly GenAILinear fc2;
+    private readonly torch.nn.Module<Tensor, Tensor> activation_fn;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi2MLP(Phi2Config config)
+ : base(nameof(Phi2MLP))
+ {
+ this.fc1 = new GenAILinear(config.HiddenSize, config.IntermediateSize, dtype: config.Dtype);
+ this.fc2 = new GenAILinear(config.IntermediateSize, config.HiddenSize, dtype: config.Dtype);
+        this.activation_fn = new NewGELUActivation();
+        this.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ using var input1 = this.fc1.forward(input);
+ using var input2 = this.activation_fn.forward(input1);
+ return this.fc2.forward(input2);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2Model.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Model.cs
new file mode 100644
index 0000000000..b96e0409f9
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2Model.cs
@@ -0,0 +1,158 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Diagnostics.Contracts;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi2Model : nn.Module<
+ Tensor, // input_ids
+ Tensor?, // attention_mask
+ int, // past_key_value_length
+ Tensor?, // position_ids
+    Tensor?, // input_embeddings
+ (
+ bool, // use_cache
+ bool, // output_attentions
+ bool // output_hidden_states
+ ),
+ (
+ Tensor, // hidden_states,
+ Tensor?, // attentions,
+ Tensor? // present_key_value
+ )>
+{
+ private readonly Phi2Config _config;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Embedding embed_tokens;
+ private readonly Dropout embed_dropout;
+ private readonly LayerNorm final_layernorm;
+    private readonly ModuleList<Phi2DecoderLayer> layers;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi2Model(Phi2Config config)
+ : base(nameof(Phi2Model))
+ {
+ this._config = config;
+ this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, dtype: config.Dtype);
+ this.embed_dropout = nn.Dropout(config.EmbdPdrop);
+ this.final_layernorm = nn.LayerNorm(config.HiddenSize, eps: config.LayerNormEps, dtype: config.Dtype);
+        this.layers = new ModuleList<Phi2DecoderLayer>(Enumerable.Range(0, config.NumHiddenLayers).Select(i => new Phi2DecoderLayer(config)).ToArray());
+ this.RegisterComponents();
+ }
+
+ public Phi2Config Config => this._config;
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override (Tensor, Tensor?, Tensor?) forward(
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ Tensor inputIds,
+ Tensor? attentionMask = null,
+ int pastKeyValueLength = 0,
+ Tensor? positionIds = null,
+ Tensor? inputEmbeddings = null,
+        (bool, bool, bool) options = default) // output_attentions, output_hidden_states, use_cache
+ {
+ (var outputAttentions, var outputHiddenStates, var useCache) = options;
+
+ // TODO
+ // add support for inputEmbeddings
+ if (inputEmbeddings is not null)
+ {
+ throw new NotImplementedException("inputEmbeddings is not supported");
+ }
+ inputEmbeddings = this.embed_tokens.forward(inputIds);
+ inputEmbeddings = this.embed_dropout.forward(inputEmbeddings);
+ var batchSize = inputIds.shape[0];
+ var seqLen = (int)inputIds.shape[1];
+
+ if (positionIds is null)
+ {
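+            // position ids continue from the cached prefix:
+            // [pastKeyValueLength, pastKeyValueLength + seqLen)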
+ positionIds = torch.arange(pastKeyValueLength, seqLen + pastKeyValueLength, dtype: inputIds.dtype, device: inputIds.device);
+ positionIds = positionIds.unsqueeze(0);
+ }
+
+ // attention
+ // use 4d attention mask
+ if (attentionMask is not null)
+ {
+ attentionMask = this.Prepare4DCasualAttentionMask(attentionMask, seqLen, pastKeyValueLength, inputEmbeddings.dtype);
+ }
+
+ var hiddenStates = inputEmbeddings;
+
+ for (int i = 0; i < this.layers.Count; i++)
+ {
+ (hiddenStates, _, _) = this.layers[i].forward(
+ hiddenStates: hiddenStates,
+ positionIds: positionIds,
+ attentionMask: attentionMask,
+ pastKeyValueLength: pastKeyValueLength,
+ useCache: useCache,
+ outputAttentions: outputAttentions);
+ }
+
+ hiddenStates = this.final_layernorm.forward(hiddenStates);
+ return (hiddenStates, null, null);
+ }
+
+ private Tensor Prepare4DCasualAttentionMask(
+ Tensor attentionMask,
+ int queryLength,
+ int pastKeyValueLength,
+ ScalarType dtype)
+ {
+ var batchSize = (int)attentionMask.shape[0];
+ var seqLen = attentionMask.shape[1];
+ Contract.Assert(seqLen == queryLength, "seqLen must be equal to queryLength");
+ var targetLength = queryLength + pastKeyValueLength;
+ var casual4DMask = this.MakeCasualAttentionMask(batchSize, queryLength, pastKeyValueLength, attentionMask.device, dtype);
+ var expandedMask = this.ExpandMask(attentionMask, dtype, queryLength).to(attentionMask.device);
+
+ casual4DMask.masked_fill_(expandedMask.to_type(ScalarType.Bool), torch.finfo(dtype).min);
+ return casual4DMask;
+ }
+
+ private Tensor ExpandMask(
+ Tensor mask,
+ ScalarType dtype,
+ int targetLength)
+ {
+ var batch = mask.shape[0];
+ var seqLen = mask.shape[1];
+ var expandedMask = mask.unsqueeze(1).unsqueeze(2);
+ expandedMask = expandedMask.expand(new long[] { batch, 1, targetLength, seqLen });
+ expandedMask = expandedMask.to_type(dtype);
+
+        var invertedMask = 1.0f - expandedMask;
+
+ return invertedMask.masked_fill(invertedMask.to_type(ScalarType.Bool), torch.finfo(dtype).min);
+ }
+
+    private Tensor MakeCasualAttentionMask(
+ int batchSize,
+ int targetLen,
+ int pastKeyValueLength,
+ Device device,
+ ScalarType dtype)
+ {
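+        // build a [targetLen, targetLen] matrix of -inf, then zero out the lower
+        // triangle (positions each query token may attend to); cached positions are
+        // always visible, so a block of zeros is prepended for them below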
+ var mask = torch.full([targetLen, targetLen], torch.finfo(dtype).min, dtype: dtype, device: device);
+ var maskCond = torch.arange(mask.size(-1), device: device);
+ mask.masked_fill_(maskCond < (maskCond + 1).view(mask.size(-1), 1), 0.0f);
+
+ mask = mask.to_type(dtype);
+
+ if (pastKeyValueLength > 0)
+ {
+ mask = torch.cat([torch.zeros([targetLen, pastKeyValueLength], dtype: dtype, device: device), mask], dim: -1);
+ }
+
+ mask = mask.unsqueeze(0).unsqueeze(0);
+ mask = mask.expand(new long[] { batchSize, 1, targetLen, targetLen + pastKeyValueLength });
+
+ return mask;
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi2RotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi2RotaryEmbedding.cs
new file mode 100644
index 0000000000..a21ed4959e
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi2RotaryEmbedding.cs
@@ -0,0 +1,49 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+internal class Phi2RotaryEmbedding : nn.Module<
+ Tensor, // input
+ int, // seq_len
+ (
+ Tensor, // cos
+ Tensor // sin
+ )>
+{
+ private readonly double _base;
+ private readonly int _maxPositionEmbeddings;
+ private readonly int _dim;
+
+ public Phi2RotaryEmbedding(double baseValue, int maxPositionEmbeddings, int dim)
+ : base(nameof(Phi2RotaryEmbedding))
+ {
+ _base = baseValue;
+ _maxPositionEmbeddings = maxPositionEmbeddings;
+ _dim = dim;
+ var thetaNumerator = torch.arange(0, _dim, 2, dtype: ScalarType.Int64).to(torch.float32);
+ this.register_buffer("inv_freq", torch.pow(baseValue, -1.0f * (thetaNumerator / dim)), persistent: false);
+ }
+
+ public int Dim => _dim;
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override (Tensor, Tensor) forward(Tensor x, int seqLen)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ // TODO
+ // can be calculated once and cached
+ var invFreq = this.get_buffer("inv_freq").to(x.device);
+ var t = torch.arange(seqLen, dtype: invFreq.dtype, device: invFreq.device);
+ var freqs = torch.outer(t, invFreq).to(torch.float32);
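+        // freqs: [seqLen, dim/2]; concatenating it with itself yields the
+        // [seqLen, dim] cos/sin tables consumed by ApplyRotaryPosEmb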
+ var emb = torch.cat([freqs, freqs], dim: -1);
+
+ var cos = torch.cos(emb);
+ var sin = torch.sin(emb);
+
+ return (cos[..seqLen].to_type(x.dtype), sin[..seqLen].to_type(x.dtype));
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs
new file mode 100644
index 0000000000..72c7c8946a
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Attention.cs
@@ -0,0 +1,196 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.Contracts;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi3AttentionInput
+{
+ public Phi3AttentionInput(
+ Tensor hiddenStates,
+ Tensor positionIds,
+ Tensor? attentionMask = null,
+ IKVCache? cache = null,
+ bool outputAttentions = false)
+ {
+ this.HiddenStates = hiddenStates;
+ this.AttentionMask = attentionMask;
+ this.PositionIds = positionIds;
+ this.Cache = cache;
+ this.OutputAttentions = outputAttentions;
+ }
+ public Tensor HiddenStates { get; set; }
+
+ public Tensor? AttentionMask { get; set; }
+
+ public Tensor PositionIds { get; set; }
+
+ public IKVCache? Cache { get; set; }
+
+ public bool OutputAttentions { get; set; }
+}
+
+internal class Phi3AttentionOutput
+{
+ public Phi3AttentionOutput(
+ Tensor hiddenStates,
+ Tensor? attentions = null,
+ IKVCache? cache = null)
+ {
+ this.HiddenStates = hiddenStates;
+ this.Attentions = attentions;
+ this.Cache = cache;
+ }
+
+ public Tensor HiddenStates { get; set; }
+
+ public Tensor? Attentions { get; set; }
+
+ public IKVCache? Cache { get; set; }
+}
+
+internal class Phi3Attention : nn.Module<Phi3AttentionInput, Phi3AttentionOutput>
+{
+ private readonly Phi3Config _config;
+ private readonly int _layerIdx;
+ private readonly double _attentionDropout;
+ private readonly int _hiddenSize;
+ private readonly int _numHeads;
+ private readonly int _headDim;
+ private readonly int _numKeyValueHeads;
+ private readonly int _numKeyValueGroups;
+ private readonly int _maxPositionEmbeddings;
+ private readonly int _originalMaxPositionEmbeddings;
+ private readonly double _ropeTheta;
+    private readonly Dictionary<string, object>? _ropeScaling;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly QuantizedLinear o_proj;
+ private readonly QuantizedLinear qkv_proj;
+    private nn.Module<Phi3RotaryEmbeddingInput, Phi3RotaryEmbeddingOutput> rotary_emb = null!;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3Attention(Phi3Config config, int layerIdx)
+ : base(nameof(Phi3Attention))
+ {
+ this._config = config;
+ this._layerIdx = layerIdx;
+ this._attentionDropout = config.AttentionDropout;
+ this._hiddenSize = config.HiddenSize;
+ this._numHeads = config.NumAttentionHeads;
+ this._headDim = this._hiddenSize / this._numHeads;
+ this._numKeyValueHeads = config.NumKeyValueHeads ?? throw new ArgumentException("num_key_value_heads must be specified");
+ this._numKeyValueGroups = this._numHeads / this._numKeyValueHeads;
+ this._maxPositionEmbeddings = config.MaxPositionEmbeddings;
+ this._originalMaxPositionEmbeddings = config.OriginalMaxPositionEmbeddings;
+ this._ropeTheta = config.RopeTheta;
+ this._ropeScaling = config.RopeScaling;
+
+ Contract.Assert(this._hiddenSize % (this._headDim * this._numHeads) == 0, "hidden_size must be divisible by num_heads");
+
+ var opSize = this._numHeads * this._headDim + 2 * (this._numKeyValueHeads * this._headDim);
+ this.o_proj = new QuantizedLinear(this._numHeads * this._headDim, this._hiddenSize, hasBias: false, dtype: config.DType);
+ this.qkv_proj = new QuantizedLinear(this._hiddenSize, opSize, hasBias: false, dtype: config.DType);
+        this.InitRope();
+        this.RegisterComponents();
+ }
+
+ private void InitRope()
+ {
+ if (this._ropeScaling is null)
+ {
+ this.rotary_emb = new Phi3RotaryEmbedding(this._ropeTheta, this._maxPositionEmbeddings, this._headDim);
+ }
+ else
+ {
+ this.rotary_emb = new Phi3SuScaledRotaryEmbedding(this._headDim, this._config);
+ }
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Phi3AttentionOutput forward(Phi3AttentionInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ using (var _ = NewDisposeScope())
+ {
+ var hiddenStates = input.HiddenStates;
+ var positionIds = input.PositionIds;
+ var outputAttentions = input.OutputAttentions;
+ var bsz = hiddenStates.shape[0];
+ var qLen = hiddenStates.shape[1];
+
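+            // Q, K and V are packed into a single qkv projection; slice the result
+            // back into the query block followed by the key and value blocks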
+ var qkv = this.qkv_proj.forward(hiddenStates);
+ var queryPos = this._numHeads * this._headDim;
+ var queryStates = qkv[.., .., ..queryPos];
+ var keyStates = qkv[.., .., queryPos..(queryPos + this._numKeyValueHeads * this._headDim)];
+ var valueStates = qkv[.., .., (queryPos + this._numKeyValueHeads * this._headDim)..];
+ queryStates = queryStates.view(bsz, qLen, this._numHeads, this._headDim).transpose(1, 2);
+ keyStates = keyStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2);
+ valueStates = valueStates.view(bsz, qLen, this._numKeyValueHeads, this._headDim).transpose(1, 2);
+
+ var kvSeqLen = keyStates.IntShape()[^2];
+ var pastKeyValue = input.Cache;
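+            // when a KV cache is in use, the effective key/value length also covers
+            // the tokens cached from previous decoding steps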
+ if (pastKeyValue is not null)
+ {
+ kvSeqLen += pastKeyValue.GetUsableLength(kvSeqLen, this._layerIdx);
+ }
+
+ var embOutput = this.rotary_emb.forward(new Phi3RotaryEmbeddingInput(valueStates, positionIds, kvSeqLen));
+ (var cos, var sin) = (embOutput.Cos, embOutput.Sin);
+
+ (queryStates, keyStates) = Utils.ApplyRotaryPosEmb(queryStates, keyStates, cos, sin);
+
+ if (pastKeyValue is not null)
+ {
+ (keyStates, valueStates) = pastKeyValue.UpdateKVCache(keyStates, valueStates, this._layerIdx);
+ }
+
+ // repeat k/v heads if n_kv_heads < n_heads
+ keyStates = Utils.Phi3RepeatKV(keyStates, this._numKeyValueGroups);
+ valueStates = Utils.Phi3RepeatKV(valueStates, this._numKeyValueGroups);
+
+ var attnWeights = torch.matmul(queryStates, keyStates.transpose(2, 3));
+ attnWeights = attnWeights / Math.Sqrt(this._headDim);
+
+ // attnWeight's shape should be [bsz, this._numHeads, qLen, kvSeqLen]
+ Contract.Assert(attnWeights.shape.Length == 4);
+ Contract.Assert(attnWeights.shape[0] == bsz);
+ Contract.Assert(attnWeights.shape[1] == this._numHeads);
+ Contract.Assert(attnWeights.shape[2] == qLen);
+ Contract.Assert(attnWeights.shape[3] == kvSeqLen);
+
+ var attentionMask = input.AttentionMask;
+ if (attentionMask is not null)
+ {
+ Contract.Assert(attentionMask.shape.Length == 4);
+ Contract.Assert(attentionMask.shape[0] == bsz);
+ Contract.Assert(attentionMask.shape[1] == 1);
+ Contract.Assert(attentionMask.shape[2] == qLen);
+ Contract.Assert(attentionMask.shape[3] == kvSeqLen);
+ attnWeights = attnWeights + attentionMask;
+ }
+
+ // upscale attention to fp32 to avoid overflow
+ attnWeights = nn.functional.softmax(attnWeights, dim: -1, dtype: ScalarType.Float32).to(valueStates.dtype);
+ attnWeights = nn.functional.dropout(attnWeights, this._attentionDropout, this.training);
+
+ var attnOutput = torch.matmul(attnWeights, valueStates);
+
+ attnOutput = attnOutput.transpose(1, 2).contiguous();
+ attnOutput = attnOutput.reshape(bsz, qLen, this._hiddenSize);
+
+ attnOutput = this.o_proj.forward(attnOutput);
+
+ return new(attnOutput.MoveToOuterDisposeScope(), outputAttentions ? attnWeights.MoveToOuterDisposeScope() : null, pastKeyValue);
+ }
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs
new file mode 100644
index 0000000000..399cd25646
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3DecoderLayer.cs
@@ -0,0 +1,129 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi3DecoderLayerInput
+{
+ public Phi3DecoderLayerInput(
+ Tensor hiddenStates,
+ Tensor attentionMask,
+ Tensor positionIds,
+ IKVCache? pastKeyValue = null,
+ bool outputAttentions = false)
+ {
+ this.HiddenStates = hiddenStates;
+ this.AttentionMask = attentionMask;
+ this.PositionIds = positionIds;
+ this.PastKeyValue = pastKeyValue;
+ this.OutputAttentions = outputAttentions;
+ }
+
+ public Tensor HiddenStates { get; set; }
+
+ public Tensor AttentionMask { get; set; }
+
+ public Tensor PositionIds { get; set; }
+
+ public IKVCache? PastKeyValue { get; set; }
+
+ public bool OutputAttentions { get; set; }
+}
+
+internal class Phi3DecoderLayerOutput
+{
+ public Phi3DecoderLayerOutput(
+ Tensor hiddenStates,
+ Tensor? attentions = null,
+ IKVCache? pastKeyValue = null)
+ {
+ this.HiddenStates = hiddenStates;
+ this.Attentions = attentions;
+ this.PastKeyValue = pastKeyValue;
+ }
+
+ public Tensor HiddenStates { get; set; }
+
+ public Tensor? Attentions { get; set; }
+
+ public IKVCache? PastKeyValue { get; set; }
+}
+
+internal class Phi3DecoderLayer : nn.Module<Phi3DecoderLayerInput, Phi3DecoderLayerOutput>, IDynamicLoadModule
+{
+ private readonly Phi3Config _config;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+    private readonly nn.Module<Phi3AttentionInput, Phi3AttentionOutput> self_attn;
+ private readonly Phi3MLP mlp;
+ private readonly Phi3RMSNorm input_layernorm;
+ private readonly Dropout resid_attn_dropout;
+ private readonly Dropout resid_mlp_dropout;
+ private readonly Phi3RMSNorm post_attention_layernorm;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3DecoderLayer(Phi3Config config, int layerIdx)
+ : base(nameof(Phi3DecoderLayer))
+ {
+ this._config = config;
+ if (config.AttnImplementation == "eager")
+ {
+ this.self_attn = new Phi3Attention(config, layerIdx);
+ }
+ else
+ {
+ throw new NotImplementedException();
+ }
+
+ this.mlp = new Phi3MLP(config);
+ this.input_layernorm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType);
+
+ this.resid_attn_dropout = nn.Dropout(config.ResidPdrop);
+ this.resid_mlp_dropout = nn.Dropout(config.ResidPdrop);
+        this.post_attention_layernorm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType);
+        this.RegisterComponents();
+ }
+
+    public Action<nn.Module>? LoadToDeviceFunc { get; set; }
+
+    public Action<nn.Module>? UnloadFromDeviceFunc { get; set; }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Phi3DecoderLayerOutput forward(Phi3DecoderLayerInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ if (LoadToDeviceFunc != null)
+ {
+ LoadToDeviceFunc(this);
+ }
+ using var disposeScope = NewDisposeScope();
+ var hiddenStates = input.HiddenStates;
+ var residual = input.HiddenStates;
+ hiddenStates = this.input_layernorm.forward(hiddenStates);
+
+ var attentionInput = new Phi3AttentionInput(hiddenStates, input.PositionIds, input.AttentionMask, input.PastKeyValue, input.OutputAttentions);
+ var output = this.self_attn.forward(attentionInput);
+ var attnOutputs = output.HiddenStates;
+ var selfAttnWeights = output.Attentions;
+ var presentKeyValue = output.Cache;
+ hiddenStates = residual + this.resid_attn_dropout.forward(attnOutputs);
+ residual = hiddenStates;
+ hiddenStates = this.post_attention_layernorm.forward(hiddenStates);
+ hiddenStates = this.mlp.forward(hiddenStates);
+ hiddenStates = residual + this.resid_mlp_dropout.forward(hiddenStates);
+
+ if (UnloadFromDeviceFunc != null)
+ {
+ UnloadFromDeviceFunc(this);
+ }
+ return new Phi3DecoderLayerOutput(hiddenStates.MoveToOuterDisposeScope(), selfAttnWeights?.MoveToOuterDisposeScope(), presentKeyValue);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs
new file mode 100644
index 0000000000..745c000800
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3MLP.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+internal class Phi3MLP : torch.nn.Module<Tensor, Tensor>
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+{
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly QuantizedLinear gate_up_proj;
+ private readonly QuantizedLinear down_proj;
+    private readonly torch.nn.Module<Tensor, Tensor> activation_fn;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3MLP(Phi3Config config)
+ : this(config.HiddenSize, config.IntermediateSize, config.HiddenAct, config.DType)
+ {
+ }
+
+ public Phi3MLP(int hiddenSize, int intermediateSize, string hiddenAct, ScalarType dtype)
+ : base(nameof(Phi3MLP))
+ {
+ this.gate_up_proj = new QuantizedLinear(hiddenSize, 2 * intermediateSize, hasBias: false, dtype: dtype);
+ this.down_proj = new QuantizedLinear(intermediateSize, hiddenSize, hasBias: false, dtype: dtype);
+ this.RegisterComponents();
+ this.activation_fn = Utils.GetActivation(hiddenAct);
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ using var input1 = this.gate_up_proj.forward(input);
+ var chunks = input1.chunk(2, dim: -1);
+ var gate = chunks[0];
+ var upStatus = chunks[1];
+ upStatus = upStatus * this.activation_fn.forward(gate);
+ return this.down_proj.forward(upStatus);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs
new file mode 100644
index 0000000000..9f9f0a17ab
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3Model.cs
@@ -0,0 +1,134 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.GenAI.Core;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi3Model : nn.Module<CasualLMModelInput, CasualLMModelOutput>
+{
+ private readonly Phi3Config _config;
+ private readonly int _paddingIdx;
+ private readonly int _vocabSize;
+ private IKVCache _cache;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Embedding embed_tokens;
+ private readonly Dropout embed_dropout;
+    private readonly ModuleList<Phi3DecoderLayer> layers;
+ private readonly Phi3RMSNorm norm;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3Model(Phi3Config config)
+ : base(nameof(Phi3Model))
+ {
+ this._config = config;
+ this._paddingIdx = config.PadTokenId ?? 32000;
+ this._vocabSize = config.VocabSize;
+
+ this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType);
+ this.embed_dropout = nn.Dropout(config.EmbdPdrop);
+        this.layers = new ModuleList<Phi3DecoderLayer>();
+
+ for (int i = 0; i < config.NumHiddenLayers; i++)
+ {
+ this.layers.Add(new Phi3DecoderLayer(config, i));
+ }
+ this.norm = new Phi3RMSNorm(config.HiddenSize, config.RmsNormEps, config.DType);
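+        // the dynamic KV cache grows with the generated sequence and is shared by all layers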
+ this._cache = new DynamicKVCache();
+ this.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override CasualLMModelOutput forward(CasualLMModelInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ if (input.OverrideCache is not null)
+ {
+ this._cache = input.OverrideCache;
+ }
+
+ var outputAttentions = input.OutputAttentions;
+ var outputHiddenStates = input.OutputHiddenStates;
+ var attentionMask = input.AttentionMask;
+ Device device;
+ var inputIds = input.InputIds;
+ var positionIds = input.PositionIds;
+ var inputsEmbeds = input.InputEmbeddings;
+ int batchSize;
+ int seqLength;
+ if (inputIds is not null && inputsEmbeds is not null)
+ {
+ throw new ArgumentException("Only one of input_ids or inputs_embeds may be set");
+ }
+ else if (inputIds is not null)
+ {
+ batchSize = inputIds.IntShape()[0];
+ seqLength = inputIds.IntShape()[1];
+ inputsEmbeds = this.embed_tokens.forward(inputIds);
+ device = inputIds.device;
+ }
+ else if (inputsEmbeds is not null)
+ {
+ batchSize = inputsEmbeds.IntShape()[0];
+ seqLength = inputsEmbeds.IntShape()[1];
+ device = inputsEmbeds.device;
+ }
+ else
+ {
+ throw new ArgumentException("Either input_ids or inputs_embeds must be set");
+ }
+
+ var pastKeyValuesLength = input.PastKeyValuesLength;
+
+ if (positionIds is null)
+ {
+ positionIds = torch.arange(pastKeyValuesLength, seqLength + pastKeyValuesLength, device: device);
+ positionIds = positionIds.unsqueeze(0).view(-1, seqLength);
+ }
+ else
+ {
+            positionIds = positionIds.view(-1, seqLength).to_type(ScalarType.Int64);
+ }
+
+ if (this._config.AttnImplementation == "flash_attention_2")
+ {
+ throw new NotImplementedException();
+ }
+ else
+ {
+ attentionMask = AttentionMaskConverter.Create4DCausalAttentionMask(attentionMask, [batchSize, seqLength], inputsEmbeds.dtype, device, pastKeyValuesLength, this._config.SlidingWindow);
+ }
+
+ var hiddenStates = inputsEmbeds;
+
+        var allHiddenStates = new List<Tensor>();
+        var allAttentions = new List<Tensor>();
+
+ foreach (var layer in this.layers)
+ {
+ if (outputHiddenStates)
+ {
+ allHiddenStates.Add(hiddenStates);
+ }
+
+ var decoderInput = new Phi3DecoderLayerInput(hiddenStates, attentionMask!, positionIds, this._cache, outputAttentions);
+ var layerOutput = layer.forward(decoderInput);
+ hiddenStates = layerOutput.HiddenStates;
+ if (outputAttentions && layerOutput.Attentions is not null)
+ {
+ allAttentions.Add(layerOutput.Attentions);
+ }
+ }
+
+ hiddenStates = this.norm.forward(hiddenStates);
+ if (outputHiddenStates)
+ {
+ allHiddenStates.Add(hiddenStates);
+ }
+
+ return new CasualLMModelOutput(lastHiddenState: hiddenStates, allHiddenStates: allHiddenStates.ToArray(), attentions: allAttentions.ToArray(), cache: this._cache);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs
new file mode 100644
index 0000000000..e8c847268e
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3RMSNorm.cs
@@ -0,0 +1,59 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+internal class Phi3RMSNorm : torch.nn.Module<Tensor, Tensor>
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+{
+ private readonly int _dim;
+ private readonly float _eps;
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Parameter weight;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3RMSNorm(
+ int hiddenSize,
+ float eps = 1e-6f,
+ ScalarType dtype = ScalarType.Float32)
+ : base(nameof(Phi3RMSNorm))
+ {
+ this._dim = hiddenSize;
+ this._eps = eps;
+
+ // the gamma scalar
+        this.weight = torch.nn.Parameter(torch.ones(this._dim, dtype: dtype));
+        this.RegisterComponents();
+ }
+
+ private Tensor Norm(Tensor x)
+ {
+ // (B, Seq_Len, Dim) * (B, Seq_Len, 1) = (B, Seq_Len, Dim)
+ // rsqrt = 1 / sqrt
+ var output = x * torch.rsqrt(x.pow(2).mean([-1L], keepdim: true) + this._eps);
+ return output;
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Tensor forward(Tensor input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ // needs higher precision for the norm so convert to float32
+ // (B, Seq_Len, Dim)
+ var normed = this.Norm(input.to_type(ScalarType.Float32)).type_as(input);
+ // (B, Seq_Len, Dim) * (Dim) = (B, Seq_Len, Dim)
+ var output = this.weight * normed;
+
+ return output;
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs
new file mode 100644
index 0000000000..9b04a301d6
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3RotaryEmbedding.cs
@@ -0,0 +1,81 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+internal class Phi3RotaryEmbeddingInput
+{
+ public Phi3RotaryEmbeddingInput(Tensor input, Tensor positionIds, int? seqLen = null)
+ {
+ Input = input;
+ PositionIds = positionIds;
+ SeqLen = seqLen;
+ }
+
+ public Tensor Input { get; set; }
+
+ public Tensor PositionIds { get; set; }
+
+ public int? SeqLen { get; set; }
+}
+
+internal class Phi3RotaryEmbeddingOutput
+{
+ public Phi3RotaryEmbeddingOutput(Tensor cos, Tensor sin)
+ {
+ Cos = cos;
+ Sin = sin;
+ }
+
+ public Tensor Cos { get; set; }
+
+ public Tensor Sin { get; set; }
+}
+
+internal class Phi3RotaryEmbedding : nn.Module<
+ Phi3RotaryEmbeddingInput,
+ Phi3RotaryEmbeddingOutput>
+{
+ private readonly double _base;
+ private readonly int _maxPositionEmbeddings;
+ private readonly int _dim;
+
+ public Phi3RotaryEmbedding(double baseValue, int maxPositionEmbeddings, int dim)
+ : base(nameof(Phi3RotaryEmbedding))
+ {
+ _base = baseValue;
+ _maxPositionEmbeddings = maxPositionEmbeddings;
+ _dim = dim;
+ var thetaNumerator = torch.arange(0, _dim, 2, dtype: ScalarType.Int64).to(torch.float32);
+ this.register_buffer("inv_freq", torch.pow(baseValue, -1.0f * (thetaNumerator / dim)), persistent: false);
+ }
+
+ public int Dim => _dim;
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Phi3RotaryEmbeddingOutput forward(Phi3RotaryEmbeddingInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ var x = input.Input;
+ var positionIds = input.PositionIds;
+ var seqLen = input.SeqLen;
+ // TODO
+ // can be calculated once and cached
+ var invFreq = this.get_buffer("inv_freq").to(x.device);
+ var invFreqExpanded = invFreq.unsqueeze(0).unsqueeze(-1);
+ invFreqExpanded = invFreqExpanded.expand(new long[] { positionIds.shape[0], -1, 1 });
+ var positionIdsExpanded = positionIds.unsqueeze(1).to(torch.float32);
+ var freqs = invFreqExpanded * positionIdsExpanded;
+ freqs = freqs.transpose(1, 2);
+ var emb = torch.cat([freqs, freqs], dim: -1);
+
+ var cos = torch.cos(emb);
+ var sin = torch.sin(emb);
+
+ return new(cos.to_type(x.dtype), sin.to_type(x.dtype));
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs b/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs
new file mode 100644
index 0000000000..ce0e70b686
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Module/Phi3SuScaledRotaryEmbedding.cs
@@ -0,0 +1,79 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+using TorchSharp;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi.Module;
+
+internal class Phi3SuScaledRotaryEmbedding : Phi3RotaryEmbedding
+{
+ private readonly double[] _shortFactor;
+ private readonly double[] _longFactor;
+ private readonly int _originalMaxPositionEmbeddings;
+ private readonly int _maxPositionEmbeddings;
+ private readonly double _base;
+
+ public Phi3SuScaledRotaryEmbedding(int dim, Phi3Config config)
+ : base(config.RopeTheta, config.MaxPositionEmbeddings, dim)
+ {
+ JsonElement shortFactorElement = (JsonElement)config.RopeScaling!["short_factor"];
+ JsonElement longFactorDocument = (JsonElement)config.RopeScaling!["long_factor"];
+ this._shortFactor = shortFactorElement.EnumerateArray().Select(e => e.GetDouble()).ToArray();
+ this._longFactor = longFactorDocument.EnumerateArray().Select(e => e.GetDouble()).ToArray();
+
+ this._originalMaxPositionEmbeddings = config.OriginalMaxPositionEmbeddings;
+ this._maxPositionEmbeddings = config.MaxPositionEmbeddings;
+ this._base = config.RopeTheta;
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override Phi3RotaryEmbeddingOutput forward(Phi3RotaryEmbeddingInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ var seqLen = (torch.max(input.PositionIds) + 1).ToInt32();
+ var x = input.Input;
+ Tensor extFactors;
+ if (seqLen > this._originalMaxPositionEmbeddings)
+ {
+            extFactors = torch.tensor(this._longFactor, dtype: ScalarType.Float32, device: x.device);
+ }
+ else
+ {
+            extFactors = torch.tensor(this._shortFactor, dtype: ScalarType.Float32, device: x.device);
+ }
+ var invFreqShape = torch.arange(0, this.Dim, 2, dtype: ScalarType.Int64).to(torch.float32) / this.Dim;
+ invFreqShape = invFreqShape.to(x.device);
+ var invFreq = 1.0f / (torch.pow(this._base, invFreqShape) * extFactors);
+
+ var invFreqExpanded = invFreq.unsqueeze(0).unsqueeze(-1);
+ invFreqExpanded = invFreqExpanded.expand(new long[] { input.PositionIds.shape[0], -1, 1 });
+ var positionIdsExpanded = input.PositionIds.unsqueeze(1).to(torch.float32);
+
+ var freqs = invFreqExpanded * positionIdsExpanded;
+ freqs = freqs.transpose(1, 2);
+ var emb = torch.cat([freqs, freqs], dim: -1);
+ var scale = (1.0 * this._maxPositionEmbeddings) / this._originalMaxPositionEmbeddings;
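+        // su-scaled (LongRoPE-style) scaling: when the context window is extended past
+        // the original maximum, cos/sin are scaled by sqrt(1 + log(scale) / log(originalMax))
+        // to keep attention magnitudes stable at long positions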
+ double scalingFactor;
+ if (scale <= 1)
+ {
+ scalingFactor = 1.0;
+ }
+ else
+ {
+ scalingFactor = Math.Sqrt(1 + Math.Log(scale) / Math.Log(this._originalMaxPositionEmbeddings));
+ }
+
+ var cos = torch.cos(emb) * scalingFactor;
+ var sin = torch.sin(emb) * scalingFactor;
+
+ return new(cos.to_type(x.dtype), sin.to_type(x.dtype));
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi2/Config.cs b/src/Microsoft.ML.GenAI.Phi/Phi2/Config.cs
new file mode 100644
index 0000000000..fdba74ba77
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi2/Config.cs
@@ -0,0 +1,118 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using System.Threading.Tasks;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi2Config
+{
+ public Phi2Config()
+ {
+ this.VocabSize = 51200;
+ this.HiddenSize = 2048;
+ this.IntermediateSize = 8192;
+ this.NumHiddenLayers = 24;
+ this.NumAttentionHeads = 32;
+ this.ResidPdrop = 0.0;
+ this.EmbdPdrop = 0.0;
+ this.AttentionDropout = 0.0;
+ this.HiddenAct = "gelu_new";
+ this.MaxPositionEmbeddings = 2048;
+ this.InitializerRange = 0.02;
+ this.LayerNormEps = 1e-5;
+ this.UseCache = true;
+ this.TieWordEmbeddings = false;
+ this.RopeTheta = 10000.0;
+ this.PartialRotaryFactor = 0.5;
+ this.QkLayernorm = false;
+ this.BosTokenId = 1;
+ this.EosTokenId = 2;
+ this.Dtype = ScalarType.Float32;
+ }
+
+ static Phi2Config()
+ {
+ var phi2ConfigContent = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.Phi.Resource.Config.phi-2-config.json");
+        var phi2Config = JsonSerializer.Deserialize<Phi2Config>(phi2ConfigContent) ?? throw new ArgumentNullException(nameof(phi2ConfigContent));
+ Phi2 = phi2Config;
+ }
+
+    /// <summary>
+    /// The default phi-2 configuration created from https://huggingface.co/microsoft/phi-2/blob/main/config.json.
+    /// </summary>
+ public static Phi2Config Phi2 { get; }
+
+ [JsonPropertyName("vocab_size")]
+ public int VocabSize { get; set; }
+
+ [JsonPropertyName("hidden_size")]
+ public int HiddenSize { get; set; }
+
+ [JsonPropertyName("intermediate_size")]
+ public int IntermediateSize { get; set; }
+
+ [JsonPropertyName("num_hidden_layers")]
+ public int NumHiddenLayers { get; set; }
+
+ [JsonPropertyName("num_attention_heads")]
+ public int NumAttentionHeads { get; set; }
+
+ [JsonPropertyName("num_key_value_heads")]
+ public int? NumKeyValueHeads { get; set; }
+
+ [JsonPropertyName("resid_pdrop")]
+ public double ResidPdrop { get; set; }
+
+ [JsonPropertyName("embd_pdrop")]
+ public double EmbdPdrop { get; set; }
+
+ [JsonPropertyName("attention_dropout")]
+ public double AttentionDropout { get; set; }
+
+ [JsonPropertyName("hidden_act")]
+ public string HiddenAct { get; set; }
+
+ [JsonPropertyName("max_position_embeddings")]
+ public int MaxPositionEmbeddings { get; set; }
+
+ [JsonPropertyName("initializer_range")]
+ public double InitializerRange { get; set; }
+
+ [JsonPropertyName("layer_norm_eps")]
+ public double LayerNormEps { get; set; }
+
+ [JsonPropertyName("use_cache")]
+ public bool UseCache { get; set; }
+
+ [JsonPropertyName("tie_word_embeddings")]
+ public bool TieWordEmbeddings { get; set; }
+
+ [JsonPropertyName("rope_theta")]
+ public double RopeTheta { get; set; }
+
+ // [JsonPropertyName("rope_scaling")]
+ // public double? RopeScaling { get; set; } = null;
+
+ [JsonPropertyName("partial_rotary_factor")]
+ public double PartialRotaryFactor { get; set; }
+
+ [JsonPropertyName("qk_layernorm")]
+ public bool QkLayernorm { get; set; }
+
+ [JsonPropertyName("bos_token_id")]
+ public int BosTokenId { get; set; }
+
+ [JsonPropertyName("eos_token_id")]
+ public int EosTokenId { get; set; }
+
+ public ScalarType Dtype { get; set; }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs
new file mode 100644
index 0000000000..efb3f23de9
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2ForCasualLM.cs
@@ -0,0 +1,68 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.CodeDom;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi.Module;
+using TorchSharp;
+using TorchSharp.Modules;
+using TorchSharp.PyBridge;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi2ForCasualLM : nn.Module<CasualLMModelInput, CasualLMModelOutput>
+{
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Phi2Model model;
+ private readonly GenAILinear lm_head;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi2ForCasualLM(Phi2Config config)
+ : base(nameof(Phi2ForCasualLM))
+ {
+ this.model = new Phi2Model(config);
+ this.lm_head = new GenAILinear(config.HiddenSize, config.VocabSize, dtype: config.Dtype);
+ this.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+    public override CasualLMModelOutput forward(CasualLMModelInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ var inputIds = input.InputIds;
+ var attentionMask = input.AttentionMask;
+ var pastKeyValueLength = input.PastKeyValuesLength;
+ var positionIds = input.PositionIds;
+ var inputEmbeddings = input.InputEmbeddings;
+ var options = (input.OutputAttentions, input.OutputHiddenStates, false);
+ var output = this.model.forward(inputIds, attentionMask, pastKeyValueLength, positionIds, inputEmbeddings, options);
+ var hiddenState = output.Item1;
+
+ var lmLogits = this.lm_head.forward(hiddenState);
+
+ return new CasualLMModelOutput(lastHiddenState: hiddenState, logits: lmLogits);
+ }
+
+ public static Phi2ForCasualLM FromPretrained(
+ string modelFolder,
+ string configName = "config.json",
+ string checkPointName = "model.safetensors.index.json",
+ ScalarType torchDtype = ScalarType.Float32,
+ bool useTqdm = false,
+ string? device = null)
+ {
+ var config = Path.Join(modelFolder, configName);
+        var modelConfig = JsonSerializer.Deserialize<Phi2Config>(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config));
+ modelConfig.Dtype = torchDtype;
+ var wrapper = new Phi2ForCasualLM(modelConfig);
+        var loadedParameters = new Dictionary<string, bool>();
+ wrapper.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, loadedParameters: loadedParameters, useTqdm: useTqdm);
+        if (device is not null)
+        {
+            wrapper.to(new Device(device));
+        }
+ wrapper.eval();
+ return wrapper;
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2TokenizerHelper.cs b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2TokenizerHelper.cs
new file mode 100644
index 0000000000..3444c74e31
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi2/Phi2TokenizerHelper.cs
@@ -0,0 +1,30 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Text;
+using System.Text.RegularExpressions;
+using Microsoft.ML.Tokenizers;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+/// <summary>
+/// The utility class to create a tokenizer for the phi-2 model.
+/// </summary>
+public class Phi2TokenizerHelper
+{
+ public static CodeGenTokenizer Create(
+ string folder,
+ string vocabFile = "vocab.json",
+ string mergesFile = "merges.txt",
+ bool addPrefixSpace = false,
+ bool addBeginOfSentence = false,
+ bool addEndOfSentence = false)
+ {
+ var vocabPath = Path.Combine(folder, vocabFile);
+ var mergesPath = Path.Combine(folder, mergesFile);
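+        // the CodeGen BPE tokenizer is built from the vocab.json / merges.txt pair
+        // shipped alongside the phi-2 checkpoint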
+ using var vocabStream = File.OpenRead(vocabPath);
+ using var mergesStream = File.OpenRead(mergesPath);
+
+ return CodeGenTokenizer.Create(vocabStream, mergesStream, addPrefixSpace, addBeginOfSentence, addEndOfSentence);
+ }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Config.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Config.cs
new file mode 100644
index 0000000000..def5ab3448
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Config.cs
@@ -0,0 +1,151 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using System.Threading.Tasks;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi;
+public class Phi3Config
+{
+ public Phi3Config()
+ {
+ this.VocabSize = 32064;
+ this.HiddenSize = 3072;
+ this.RmsNormEps = 1e-5f;
+ this.IntermediateSize = 8192;
+ this.NumHiddenLayers = 32;
+ this.NumAttentionHeads = 32;
+ this.ResidPdrop = 0.0;
+ this.EmbdPdrop = 0.0;
+ this.AttentionDropout = 0.0;
+ this.HiddenAct = "silu";
+ this.MaxPositionEmbeddings = 4096;
+ this.OriginalMaxPositionEmbeddings = 4096;
+ this.InitializerRange = 0.02;
+ this.UseCache = true;
+ this.TieWordEmbeddings = false;
+ this.RopeTheta = 10000.0;
+ this.PartialRotaryFactor = 0.5;
+ this.QkLayernorm = false;
+ this.BosTokenId = 1;
+ this.EosTokenId = 32000;
+ this.DType = ScalarType.BFloat16;
+ this.AttnImplementation = "eager";
+ }
+
+ static Phi3Config()
+ {
+ var phi3Mini4kInstructContent = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.Phi.Resource.Config.phi-3-mini-4k-instruct-config.json");
+ var phi3Mini128kInstructContent = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.Phi.Resource.Config.phi-3-mini-128k-instruct-config.json");
+ var phi3Medium4kInstructContent = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.Phi.Resource.Config.phi-3-medium-4k-instruct-config.json");
+ var phi3Medium128kInstructContent = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.Phi.Resource.Config.phi-3-medium-128k-instruct-config.json");
+
+        Phi3Mini4kInstruct = JsonSerializer.Deserialize<Phi3Config>(phi3Mini4kInstructContent) ?? throw new ArgumentNullException(nameof(phi3Mini4kInstructContent));
+        Phi3Mini128kInstruct = JsonSerializer.Deserialize<Phi3Config>(phi3Mini128kInstructContent) ?? throw new ArgumentNullException(nameof(phi3Mini128kInstructContent));
+        Phi3Medium4kInstruct = JsonSerializer.Deserialize<Phi3Config>(phi3Medium4kInstructContent) ?? throw new ArgumentNullException(nameof(phi3Medium4kInstructContent));
+        Phi3Medium128kInstruct = JsonSerializer.Deserialize<Phi3Config>(phi3Medium128kInstructContent) ?? throw new ArgumentNullException(nameof(phi3Medium128kInstructContent));
+ }
+
+    /// <summary>
+    /// The phi-3-mini-4k-instruct configuration created from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json.
+    /// </summary>
+ public static Phi3Config Phi3Mini4kInstruct { get; }
+
+    /// <summary>
+    /// The phi-3-medium-4k-instruct configuration created from https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/blob/main/config.json.
+    /// </summary>
+ public static Phi3Config Phi3Medium4kInstruct { get; }
+
+    /// <summary>
+    /// The phi-3-medium-128k-instruct configuration created from https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/blob/main/config.json.
+    /// </summary>
+ public static Phi3Config Phi3Medium128kInstruct { get; }
+
+    /// <summary>
+    /// The phi-3-mini-128k-instruct configuration created from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json.
+    /// </summary>
+ public static Phi3Config Phi3Mini128kInstruct { get; }
+
+ [JsonPropertyName("vocab_size")]
+ public int VocabSize { get; set; }
+
+ [JsonPropertyName("hidden_size")]
+ public int HiddenSize { get; set; }
+
+ [JsonPropertyName("rms_norm_eps")]
+ public float RmsNormEps { get; set; }
+
+ [JsonPropertyName("intermediate_size")]
+ public int IntermediateSize { get; set; }
+
+ [JsonPropertyName("num_hidden_layers")]
+ public int NumHiddenLayers { get; set; }
+
+ [JsonPropertyName("num_attention_heads")]
+ public int NumAttentionHeads { get; set; }
+
+ [JsonPropertyName("num_key_value_heads")]
+ public int? NumKeyValueHeads { get; set; }
+
+ [JsonPropertyName("resid_pdrop")]
+ public double ResidPdrop { get; set; }
+
+ [JsonPropertyName("embd_pdrop")]
+ public double EmbdPdrop { get; set; }
+
+ [JsonPropertyName("attention_dropout")]
+ public double AttentionDropout { get; set; }
+
+ [JsonPropertyName("hidden_act")]
+ public string HiddenAct { get; set; }
+
+ [JsonPropertyName("max_position_embeddings")]
+ public int MaxPositionEmbeddings { get; set; }
+
+ [JsonPropertyName("original_max_position_embeddings")]
+ public int OriginalMaxPositionEmbeddings { get; set; }
+
+ [JsonPropertyName("initializer_range")]
+ public double InitializerRange { get; set; }
+
+ [JsonPropertyName("use_cache")]
+ public bool UseCache { get; set; }
+
+ [JsonPropertyName("tie_word_embeddings")]
+ public bool TieWordEmbeddings { get; set; }
+
+ [JsonPropertyName("rope_theta")]
+ public double RopeTheta { get; set; }
+
+ [JsonPropertyName("rope_scaling")]
+    public Dictionary<string, object>? RopeScaling { get; set; }
+
+ [JsonPropertyName("partial_rotary_factor")]
+ public double PartialRotaryFactor { get; set; }
+
+ [JsonPropertyName("qk_layernorm")]
+ public bool QkLayernorm { get; set; }
+
+ [JsonPropertyName("bos_token_id")]
+ public int BosTokenId { get; set; }
+
+ [JsonPropertyName("eos_token_id")]
+ public int EosTokenId { get; set; }
+
+ [JsonPropertyName("pad_token_id")]
+ public int? PadTokenId { get; set; }
+
+ [JsonPropertyName("sliding_window")]
+ public int? SlidingWindow { get; set; }
+
+ public ScalarType DType { get; set; }
+
+ public string AttnImplementation { get; set; }
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMAgent.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMAgent.cs
new file mode 100644
index 0000000000..abe1e92716
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMAgent.cs
@@ -0,0 +1,114 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Threading.Tasks;
+using AutoGen.Core;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.Tokenizers;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi3Agent : IStreamingAgent
+{
+ private const char Newline = '\n';
+    private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline;
+ private readonly string? _systemMessage;
+
+ public Phi3Agent(
+        ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline,
+ string name,
+ string? systemMessage = "you are a helpful assistant")
+ {
+ this.Name = name;
+ this._pipeline = pipeline;
+ this._systemMessage = systemMessage;
+ }
+
+ public string Name { get; }
+
+    public Task<IMessage> GenerateReplyAsync(IEnumerable<IMessage> messages, GenerateReplyOptions? options = null, CancellationToken cancellationToken = default)
+ {
+ var input = BuildPrompt(messages);
+ var maxLen = options?.MaxToken ?? 1024;
+ var temperature = options?.Temperature ?? 0.7f;
+ var stopTokenSequence = options?.StopSequence ?? [];
+ stopTokenSequence = stopTokenSequence.Append("<|end|>").ToArray();
+
+ var output = _pipeline.Generate(
+ input,
+ maxLen: maxLen,
+ temperature: temperature,
+ stopSequences: stopTokenSequence) ?? throw new InvalidOperationException("Failed to generate a reply.");
+
+        return Task.FromResult<IMessage>(new TextMessage(Role.Assistant, output, from: this.Name));
+ }
+
+#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
+    public async IAsyncEnumerable<IMessage> GenerateStreamingReplyAsync(
+#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+        IEnumerable<IMessage> messages,
+ GenerateReplyOptions? options = null,
+ [EnumeratorCancellation] CancellationToken cancellationToken = default)
+ {
+ var input = BuildPrompt(messages);
+ var maxLen = options?.MaxToken ?? 1024;
+ var temperature = options?.Temperature ?? 0.7f;
+ var stopTokenSequence = options?.StopSequence ?? [];
+ stopTokenSequence = stopTokenSequence.Append("<|end|>").ToArray();
+
+ foreach (var output in _pipeline.GenerateStreaming(
+ input,
+ maxLen: maxLen,
+ temperature: temperature,
+ stopSequences: stopTokenSequence))
+ {
+ yield return new TextMessageUpdate(Role.Assistant, output, from: this.Name);
+ }
+ }
+
+    private string BuildPrompt(IEnumerable<IMessage> messages)
+ {
+ var availableRoles = new[] { Role.System, Role.User, Role.Assistant };
+ if (messages.Any(m => m.GetContent() is null))
+ {
+ throw new InvalidOperationException("Please provide a message with content.");
+ }
+
+ if (messages.Any(m => m.GetRole() is null || availableRoles.Contains(m.GetRole()!.Value) == false))
+ {
+ throw new InvalidOperationException("Please provide a message with a valid role. The valid roles are System, User, and Assistant.");
+ }
+
+ // construct template based on instruction from
+ // https://huggingface.co/microsoft/Phi-3-mini-128k-instruct#chat-format
+
+ var sb = new StringBuilder();
+ if (_systemMessage is not null)
+ {
+ sb.Append($"<|system|>{Newline}{_systemMessage}<|end|>{Newline}");
+ }
+ foreach (var message in messages)
+ {
+            var content = message.GetContent()!;
+ sb.Append(message switch
+ {
+ _ when message.GetRole() == Role.System => $"<|system|>{Newline}{content}<|end|>{Newline}",
+ _ when message.GetRole() == Role.User => $"<|user|>{Newline}{content}<|end|>{Newline}",
+ _ when message.GetRole() == Role.Assistant => $"<|assistant|>{Newline}{content}<|end|>{Newline}",
+ _ => throw new InvalidOperationException("Invalid role.")
+ });
+ }
+
+ sb.Append("<|assistant|>");
+ var input = sb.ToString();
+
+ return input;
+ }
+}
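For context, a minimal sketch of driving `Phi3Agent` from AutoGen. The pipeline construction assumes the `CausalLMPipeline` implementation and `Phi3TokenizerHelper` introduced elsewhere in this change; the weight folder path and the exact `CausalLMPipeline` constructor shape are illustrative assumptions, not part of this diff.

```csharp
using AutoGen.Core;
using Microsoft.ML.GenAI.Core;
using Microsoft.ML.GenAI.Phi;
using Microsoft.ML.Tokenizers;

// placeholder path: point this at a local Phi-3 checkpoint
var weightFolder = @"C:\models\Phi-3-mini-4k-instruct";

var tokenizer = Phi3TokenizerHelper.FromPretrained(Path.Join(weightFolder, "tokenizer.model"));
var model = Phi3ForCasualLM.FromPretrained(weightFolder, device: "cpu");

// assumed constructor shape: (tokenizer, model, device)
var pipeline = new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, model, "cpu");

var agent = new Phi3Agent(pipeline, name: "assistant");

// SendAsync is AutoGen's convenience extension over GenerateReplyAsync
var reply = await agent.SendAsync("What is the golden ratio?");
Console.WriteLine(reply.GetContent());
```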
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs
new file mode 100644
index 0000000000..efe3089fdb
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMChatCompletionService.cs
@@ -0,0 +1,85 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+using System.Text;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.Tokenizers;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel.TextGeneration;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi3CausalLMChatCompletionService : IChatCompletionService
+{
+ private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline;
+ private readonly Phi3CausalLMTextGenerationService _textGenerationService;
+ private const char NewLine = '\n'; // must be \n; \r\n produces wonky results.
+
+ public Phi3CausalLMChatCompletionService(ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
+ {
+ _pipeline = pipeline;
+ _textGenerationService = new Phi3CausalLMTextGenerationService(pipeline);
+ }
+
+ public IReadOnlyDictionary<string, object?> Attributes => _textGenerationService.Attributes;
+
+ public async Task<IReadOnlyList<ChatMessageContent>> GetChatMessageContentsAsync(
+ ChatHistory chatHistory,
+ PromptExecutionSettings? executionSettings = null,
+ Kernel? kernel = null,
+ CancellationToken cancellationToken = default)
+ {
+ var prompt = BuildPrompt(chatHistory);
+ var reply = await _textGenerationService.GetTextContentAsync(prompt, executionSettings, kernel, cancellationToken);
+ return [new ChatMessageContent(AuthorRole.Assistant, reply.Text)];
+ }
+
+ public async IAsyncEnumerable<StreamingChatMessageContent> GetStreamingChatMessageContentsAsync(
+ ChatHistory chatHistory,
+ PromptExecutionSettings? executionSettings = null,
+ Kernel? kernel = null,
+ [EnumeratorCancellation]
+ CancellationToken cancellationToken = default)
+ {
+ var prompt = BuildPrompt(chatHistory);
+
+ await foreach (var reply in _textGenerationService.GetStreamingTextContentsAsync(prompt, executionSettings, kernel, cancellationToken))
+ {
+ yield return new StreamingChatMessageContent(AuthorRole.Assistant, reply.Text);
+ }
+ }
+
+ private string BuildPrompt(ChatHistory chatHistory)
+ {
+ // build prompt from chat history
+ var sb = new StringBuilder();
+
+ foreach (var message in chatHistory)
+ {
+ foreach (var item in message.Items)
+ {
+ if (item is not TextContent textContent)
+ {
+ throw new NotSupportedException($"Only text content is supported, but got {item.GetType().Name}");
+ }
+
+ var prompt = message.Role switch
+ {
+ _ when message.Role == AuthorRole.System => $"<|system|>{NewLine}{textContent}<|end|>{NewLine}",
+ _ when message.Role == AuthorRole.User => $"<|user|>{NewLine}{textContent}<|end|>{NewLine}",
+ _ when message.Role == AuthorRole.Assistant => $"<|assistant|>{NewLine}{textContent}<|end|>{NewLine}",
+ _ => throw new NotSupportedException($"Unsupported role {message.Role}")
+ };
+
+ sb.Append(prompt);
+ }
+ }
+
+ sb.Append("<|assistant|>");
+
+ return sb.ToString();
+ }
+}
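A sketch of registering the service with Semantic Kernel; only the kernel-builder plumbing here is standard SK API, and the `pipeline` variable is assumed to be built as in the agent sketch above.

```csharp
using Microsoft.Extensions.DependencyInjection;
using Microsoft.ML.GenAI.Phi;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.ChatCompletion;

var builder = Kernel.CreateBuilder();
builder.Services.AddSingleton<IChatCompletionService>(
    new Phi3CausalLMChatCompletionService(pipeline));
var kernel = builder.Build();

var chat = kernel.GetRequiredService<IChatCompletionService>();
var history = new ChatHistory();
history.AddSystemMessage("you are a helpful assistant");
history.AddUserMessage("Summarize the Phi-3 chat template in one sentence.");

// the service folds the history into the <|system|>/<|user|>/<|assistant|> template shown above
var response = await chat.GetChatMessageContentsAsync(history);
Console.WriteLine(response[0].Content);
```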
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMTextGenerationService.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMTextGenerationService.cs
new file mode 100644
index 0000000000..ac22b4f353
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3CausalLMTextGenerationService.cs
@@ -0,0 +1,73 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Runtime.CompilerServices;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.Tokenizers;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.TextGeneration;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi3CausalLMTextGenerationService : ITextGenerationService
+{
+ private readonly ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> _pipeline;
+
+ public Phi3CausalLMTextGenerationService(ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> pipeline)
+ {
+ _pipeline = pipeline;
+ }
+
+ public IReadOnlyDictionary<string, object?> Attributes => new Dictionary<string, object?>()
+ {
+ { "temperature", null },
+ { "max_token", null },
+ { "stop_token_sequence", null },
+ { "top_p", null },
+ };
+
+ public Task<IReadOnlyList<TextContent>> GetTextContentsAsync(string prompt, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
+ {
+ var temperature = executionSettings?.ExtensionData?["temperature"] as float? ?? 0.7f;
+ var maxToken = executionSettings?.ExtensionData?["max_token"] as int? ?? 512;
+ var stopTokenSequence = executionSettings?.ExtensionData?["stop_token_sequence"] as List ?? new List();
+ var topP = executionSettings?.ExtensionData?["top_p"] as float? ?? 0.9f;
+ stopTokenSequence.Add("<|end|>");
+ var response = _pipeline.Generate(
+ prompt,
+ maxToken,
+ temperature,
+ stopSequences: stopTokenSequence.ToArray(),
+ topP: topP);
+
+ return Task.FromResult<IReadOnlyList<TextContent>>([new TextContent(response)]);
+ }
+
+#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
+ public async IAsyncEnumerable<StreamingTextContent> GetStreamingTextContentsAsync(
+#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
+ string prompt,
+ PromptExecutionSettings? executionSettings = null,
+ Kernel? kernel = null,
+ [EnumeratorCancellation]
+ CancellationToken cancellationToken = default)
+ {
+ var temperature = executionSettings?.ExtensionData?["temperature"] as float? ?? 0.7f;
+ var maxToken = executionSettings?.ExtensionData?["max_token"] as int? ?? 100;
+ var stopTokenSequence = executionSettings?.ExtensionData?["stop_token_sequence"] as string[] ?? Array.Empty();
+ var topP = executionSettings?.ExtensionData?["top_p"] as float? ?? 0.9f;
+ stopTokenSequence.Append("<|end|>");
+
+ foreach (var item in _pipeline.GenerateStreaming(
+ prompt,
+ maxToken,
+ temperature,
+ topP,
+ stopTokenSequence))
+ {
+ yield return new StreamingTextContent(item);
+ }
+ }
+}
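Because the service pulls its sampling parameters out of `PromptExecutionSettings.ExtensionData` by key, callers opt in per request. A small sketch follows; the key names match the `Attributes` dictionary above, and since the values are read with the dictionary indexer, any `ExtensionData` you pass should contain every key the service references. The `textGenerationService` variable is assumed to be constructed as shown earlier.

```csharp
using Microsoft.SemanticKernel;

var settings = new PromptExecutionSettings
{
    ExtensionData = new Dictionary<string, object>
    {
        ["temperature"] = 0.3f, // boxed float, unboxed by the service via `as float?`
        ["max_token"] = 256,
        ["stop_token_sequence"] = new List<string> { "<|user|>" },
        ["top_p"] = 0.95f,
    },
};

var contents = await textGenerationService.GetTextContentsAsync(
    "Write a haiku about autumn.", settings);
```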
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs
new file mode 100644
index 0000000000..41b2d970fd
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3ForCasualLM.cs
@@ -0,0 +1,73 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi.Module;
+using TorchSharp;
+using TorchSharp.Modules;
+using TorchSharp.PyBridge;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+public class Phi3ForCasualLM : nn.Module<CasualLMModelInput, CasualLMModelOutput>
+{
+ private readonly Phi3Config _config;
+
+#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
+ private readonly Phi3Model model;
+ private readonly GenAILinear lm_head;
+#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
+
+ public Phi3ForCasualLM(Phi3Config config)
+ : base(nameof(Phi3ForCasualLM))
+ {
+ this._config = config;
+ this.model = new Phi3Model(config);
+ this.lm_head = new GenAILinear(config.HiddenSize, config.VocabSize, dtype: config.DType, hasBias: false);
+
+ this.RegisterComponents();
+ }
+
+#pragma warning disable MSML_GeneralName // This name should be PascalCased
+ public override CasualLMModelOutput forward(CasualLMModelInput input)
+#pragma warning restore MSML_GeneralName // This name should be PascalCased
+ {
+ var outputs = this.model.forward(input);
+ var logits = this.lm_head.forward(outputs.LastHiddenState);
+ logits = logits.to_type(ScalarType.Float32);
+ outputs.Logits = logits;
+
+ return outputs;
+ }
+
+ public static Phi3ForCasualLM FromPretrained(
+ string modelFolder,
+ string configName = "config.json",
+ string checkPointName = "model.safetensors.index.json",
+ ScalarType torchDtype = ScalarType.BFloat16,
+ string device = "cpu")
+ {
+ var config = Path.Join(modelFolder, configName);
+ var modelConfig = JsonSerializer.Deserialize<Phi3Config>(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config));
+ modelConfig.DType = torchDtype;
+ var phi = new Phi3ForCasualLM(modelConfig);
+ phi.LoadSafeTensors(modelFolder, checkPointName);
+ phi = phi.to(device);
+ phi.eval();
+
+ return phi;
+ }
+
+ public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json")
+ {
+ this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, useTqdm: false);
+ }
+}
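`FromPretrained` follows the Hugging Face folder layout: it deserializes `config.json` into a `Phi3Config`, builds the module graph, then streams the safetensors checkpoints listed in `model.safetensors.index.json` into it. A minimal loading sketch (the path is a placeholder; `Float32` is chosen here only because the `BFloat16` default assumes hardware support for that dtype):

```csharp
using Microsoft.ML.GenAI.Phi;
using static TorchSharp.torch;

var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // placeholder path

var model = Phi3ForCasualLM.FromPretrained(
    weightFolder,
    torchDtype: ScalarType.Float32,
    device: "cpu");

// the model comes back in eval mode, ready to be wrapped in a CausalLMPipeline
```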
diff --git a/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3TokenizerHelper.cs b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3TokenizerHelper.cs
new file mode 100644
index 0000000000..dd54378892
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Phi3/Phi3TokenizerHelper.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Text;
+using System.Text.RegularExpressions;
+using Microsoft.ML.Tokenizers;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+/// <summary>
+/// Utility class for creating a tokenizer for the phi-3 model.
+/// </summary>
+public class Phi3TokenizerHelper
+{
+ private const string SystemSymbol = "<|system|>";
+ private const string UserSymbol = "<|user|>";
+ private const string AssistantSymbol = "<|assistant|>";
+ private const string EndSymbol = "<|end|>";
+ private const int SystemSymbolId = 32006;
+ private const int UserSymbolId = 32010;
+ private const int AssistantSymbolId = 32001;
+ private const int EndSymbolId = 32007;
+
+ public static LlamaTokenizer FromPretrained(
+ string modelPath,
+ string systemSymbol = SystemSymbol,
+ string userSymbol = UserSymbol,
+ string assistantSymbol = AssistantSymbol,
+ string endSymbol = EndSymbol,
+ int systemSymbolId = SystemSymbolId,
+ int userSymbolId = UserSymbolId,
+ int assistantSymbolId = AssistantSymbolId,
+ int endSymbolId = EndSymbolId,
+ bool addPrecedingSpace = true)
+ {
+ var modelStream = File.OpenRead(modelPath);
+
+ var llamaTokenizer = LlamaTokenizer.Create(
+ modelStream,
+ addPrecedingSpace,
+ specialTokens: new Dictionary<string, int>
+ {
+ { systemSymbol, systemSymbolId },
+ { userSymbol, userSymbolId },
+ { assistantSymbol, assistantSymbolId },
+ { endSymbol, endSymbolId }
+ });
+
+ return llamaTokenizer;
+ }
+}
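The fixed token ids above matter because the chat-control markers must round-trip as single tokens; otherwise the model sees the template split into ordinary word pieces. A small sketch of the helper in action (the `EncodeToIds` call is the Microsoft.ML.Tokenizers API; the path is a placeholder):

```csharp
using Microsoft.ML.Tokenizers;

var tokenizer = Phi3TokenizerHelper.FromPretrained(
    @"C:\models\Phi-3-mini-4k-instruct\tokenizer.model"); // placeholder path

var ids = tokenizer.EncodeToIds("<|user|>\nHello<|end|>\n<|assistant|>");
// ids contains 32010 for <|user|> and 32007 for <|end|> as single entries,
// rather than the byte-pair fragments a vanilla Llama tokenizer would emit.
```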
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-2-config.json b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-2-config.json
new file mode 100644
index 0000000000..c3a5b1ce17
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-2-config.json
@@ -0,0 +1,31 @@
+{
+ "_name_or_path": "microsoft/phi-2",
+ "architectures": [
+ "PhiForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 50256,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 50256,
+ "hidden_act": "gelu_new",
+ "hidden_size": 2560,
+ "initializer_range": 0.02,
+ "intermediate_size": 10240,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 2048,
+ "model_type": "phi",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "partial_rotary_factor": 0.4,
+ "qk_layernorm": false,
+ "resid_pdrop": 0.1,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.37.0",
+ "use_cache": true,
+ "vocab_size": 51200
+ }
+
\ No newline at end of file
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-128k-instruct-config.json b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-128k-instruct-config.json
new file mode 100644
index 0000000000..f058b81505
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-128k-instruct-config.json
@@ -0,0 +1,170 @@
+{
+ "_name_or_path": "Phi-3-medium-128k-instruct",
+ "architectures": [
+ "Phi3ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_phi3.Phi3Config",
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+ },
+ "bos_token_id": 1,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 32000,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 17920,
+ "max_position_embeddings": 131072,
+ "model_type": "phi3",
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 10,
+ "original_max_position_embeddings": 4096,
+ "pad_token_id": null,
+ "resid_pdrop": 0.0,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "long_factor": [
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.25,
+ 1.25,
+ 1.5,
+ 2.0,
+ 2.75,
+ 5.75,
+ 5.75,
+ 6.5,
+ 9.25,
+ 11.0,
+ 13.25,
+ 19.25,
+ 19.75,
+ 19.75,
+ 21.25,
+ 21.5,
+ 26.5,
+ 30.0,
+ 33.75,
+ 35.25,
+ 38.5,
+ 42.0,
+ 42.25,
+ 46.0,
+ 47.0,
+ 50.0,
+ 50.5,
+ 51.0,
+ 52.0,
+ 52.75,
+ 53.75,
+ 54.75,
+ 57.0,
+ 57.25,
+ 58.5,
+ 59.25,
+ 59.5,
+ 62.0,
+ 62.5,
+ 62.75,
+ 63.25,
+ 63.25,
+ 63.25,
+ 63.75,
+ 64.0,
+ 64.0,
+ 64.25,
+ 64.5,
+ 64.5,
+ 65.0,
+ 65.0
+ ],
+ "short_factor": [
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.01,
+ 1.02,
+ 1.02,
+ 1.04,
+ 1.04,
+ 1.07,
+ 1.07,
+ 1.1,
+ 1.3000000000000003,
+ 1.3000000000000003,
+ 1.5000000000000004,
+ 1.5700000000000005,
+ 1.9000000000000008,
+ 2.3100000000000014,
+ 2.759999999999992,
+ 3.3899999999999784,
+ 3.9399999999999666,
+ 4.009999999999965,
+ 4.289999999999959,
+ 4.349999999999958,
+ 5.349999999999937,
+ 6.659999999999909,
+ 7.029999999999901,
+ 7.51999999999989,
+ 8.00999999999988,
+ 8.249999999999876,
+ 8.279999999999875,
+ 9.629999999999846,
+ 9.89999999999984,
+ 10.589999999999826,
+ 11.049999999999816,
+ 11.7899999999998,
+ 12.189999999999792,
+ 12.889999999999777,
+ 13.129999999999772,
+ 13.16999999999977,
+ 13.20999999999977,
+ 13.479999999999764,
+ 13.539999999999763,
+ 13.779999999999758,
+ 13.929999999999755,
+ 14.429999999999744,
+ 14.759999999999737,
+ 15.149999999999729,
+ 15.419999999999723,
+ 15.53999999999972,
+ 15.659999999999718,
+ 15.749999999999716,
+ 15.759999999999716,
+ 15.799999999999715,
+ 16.05999999999971,
+ 16.079999999999714,
+ 16.11999999999972,
+ 16.11999999999972,
+ 16.18999999999973,
+ 16.31999999999975,
+ 16.539999999999786,
+ 16.799999999999827
+ ],
+ "type": "su"
+ },
+ "rope_theta": 10000.0,
+ "sliding_window": 131072,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.39.3",
+ "use_cache": true,
+ "attention_bias": false,
+ "vocab_size": 32064
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-4k-instruct-config.json b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-4k-instruct-config.json
new file mode 100644
index 0000000000..4ded05bfd8
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-medium-4k-instruct-config.json
@@ -0,0 +1,36 @@
+{
+ "_name_or_path": "Phi-3-medium-4k-instruct",
+ "architectures": [
+ "Phi3ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_phi3.Phi3Config",
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+ },
+ "bos_token_id": 1,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 32000,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 17920,
+ "max_position_embeddings": 4096,
+ "model_type": "phi3",
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 10,
+ "original_max_position_embeddings": 4096,
+ "pad_token_id": 32000,
+ "resid_pdrop": 0.0,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "sliding_window": 2047,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.39.3",
+ "use_cache": true,
+ "attention_bias": false,
+ "vocab_size": 32064
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-128k-instruct-config.json b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-128k-instruct-config.json
new file mode 100644
index 0000000000..9aae2a82fc
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-128k-instruct-config.json
@@ -0,0 +1,140 @@
+{
+ "_name_or_path": "Phi-3-mini-128k-instruct",
+ "architectures": [
+ "Phi3ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_phi3.Phi3Config",
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
+ "AutoModelForSequenceClassification": "modeling_phi3.Phi3ForSequenceClassification",
+ "AutoModelForTokenClassification": "modeling_phi3.Phi3ForTokenClassification"
+ },
+ "bos_token_id": 1,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 32000,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 131072,
+ "model_type": "phi3",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "original_max_position_embeddings": 4096,
+ "pad_token_id": 32000,
+ "resid_pdrop": 0.0,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "long_factor": [
+ 1.0299999713897705,
+ 1.0499999523162842,
+ 1.0499999523162842,
+ 1.0799999237060547,
+ 1.2299998998641968,
+ 1.2299998998641968,
+ 1.2999999523162842,
+ 1.4499999284744263,
+ 1.5999999046325684,
+ 1.6499998569488525,
+ 1.8999998569488525,
+ 2.859999895095825,
+ 3.68999981880188,
+ 5.419999599456787,
+ 5.489999771118164,
+ 5.489999771118164,
+ 9.09000015258789,
+ 11.579999923706055,
+ 15.65999984741211,
+ 15.769999504089355,
+ 15.789999961853027,
+ 18.360000610351562,
+ 21.989999771118164,
+ 23.079999923706055,
+ 30.009998321533203,
+ 32.35000228881836,
+ 32.590003967285156,
+ 35.56000518798828,
+ 39.95000457763672,
+ 53.840003967285156,
+ 56.20000457763672,
+ 57.95000457763672,
+ 59.29000473022461,
+ 59.77000427246094,
+ 59.920005798339844,
+ 61.190006256103516,
+ 61.96000671386719,
+ 62.50000762939453,
+ 63.3700065612793,
+ 63.48000717163086,
+ 63.48000717163086,
+ 63.66000747680664,
+ 63.850006103515625,
+ 64.08000946044922,
+ 64.760009765625,
+ 64.80001068115234,
+ 64.81001281738281,
+ 64.81001281738281
+ ],
+ "short_factor": [
+ 1.05,
+ 1.05,
+ 1.05,
+ 1.1,
+ 1.1,
+ 1.1500000000000001,
+ 1.2000000000000002,
+ 1.2500000000000002,
+ 1.3000000000000003,
+ 1.3500000000000003,
+ 1.5000000000000004,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.000000000000001,
+ 2.0500000000000007,
+ 2.0500000000000007,
+ 2.0500000000000007,
+ 2.1000000000000005,
+ 2.1000000000000005,
+ 2.1000000000000005,
+ 2.1500000000000004,
+ 2.1500000000000004,
+ 2.3499999999999996,
+ 2.549999999999999,
+ 2.5999999999999988,
+ 2.5999999999999988,
+ 2.7499999999999982,
+ 2.849999999999998,
+ 2.849999999999998,
+ 2.9499999999999975
+ ],
+ "type": "su"
+ },
+ "rope_theta": 10000.0,
+ "sliding_window": 262144,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.39.3",
+ "use_cache": true,
+ "attention_bias": false,
+ "vocab_size": 32064
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-4k-instruct-config.json b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-4k-instruct-config.json
new file mode 100644
index 0000000000..92bb5b42eb
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/Config/phi-3-mini-4k-instruct-config.json
@@ -0,0 +1,36 @@
+{
+ "_name_or_path": "Phi-3-mini-4k-instruct",
+ "architectures": [
+ "Phi3ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_phi3.Phi3Config",
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+ },
+ "bos_token_id": 1,
+ "embd_pdrop": 0.0,
+ "eos_token_id": 32000,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 4096,
+ "model_type": "phi3",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "original_max_position_embeddings": 4096,
+ "pad_token_id": 32000,
+ "resid_pdrop": 0.0,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "sliding_window": 2047,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.39.3",
+ "use_cache": true,
+ "attention_bias": false,
+ "vocab_size": 32064
+}
diff --git a/src/Microsoft.ML.GenAI.Phi/Resource/README.md b/src/Microsoft.ML.GenAI.Phi/Resource/README.md
new file mode 100644
index 0000000000..35f35bda1a
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Resource/README.md
@@ -0,0 +1,8 @@
+## This folder includes the embedded resources for the GenAI.Phi project.
+
+### Configuration
+- [phi-2-config.json](https://huggingface.co/microsoft/phi-2/blob/main/config.json): The phi-2 model configuration file.
+- [phi-3-mini-128k-instruct-config.json](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json): The phi-3-mini-128k model configuration file.
+- [phi-3-mini-4k-instruct-config.json](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json): The phi-3-mini-4k model configuration file.
+- [phi-3-medium-4k-instruct-config.json](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/blob/main/config.json): The phi-3-medium-4k model configuration file.
+- [phi-3-medium-128k-instruct-config.json](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/blob/main/config.json): The phi-3-medium-128k model configuration file.
\ No newline at end of file
diff --git a/src/Microsoft.ML.GenAI.Phi/Utils.cs b/src/Microsoft.ML.GenAI.Phi/Utils.cs
new file mode 100644
index 0000000000..4591d94f14
--- /dev/null
+++ b/src/Microsoft.ML.GenAI.Phi/Utils.cs
@@ -0,0 +1,179 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Reflection;
+using System.Text;
+using System.Threading.Tasks;
+using TorchSharp;
+using TorchSharp.Modules;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+namespace Microsoft.ML.GenAI.Phi;
+
+internal static class Utils
+{
+ public static string GetEmbeddedResource(string resourceName)
+ {
+ // read file content from embedded resource
+ var assembly = Assembly.GetExecutingAssembly();
+ var resourceStream = assembly.GetManifestResourceStream(resourceName);
+
+ if (resourceStream == null)
+ {
+ throw new ArgumentException("Resource not found", nameof(resourceName));
+ }
+
+ using var reader = new System.IO.StreamReader(resourceStream);
+ return reader.ReadToEnd();
+ }
+
+ public static Tensor ApplyRotaryEmbeddings(Tensor input, Tensor freqsComplex)
+ {
+ // Separate the last dimension pairs of two values, representing the real and imaginary parts of the complex number
+ // Two consecutive values will become a single complex number
+ // (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2)
+ var inputComplex = input.to_type(ScalarType.Float32).reshape(input.shape[0], input.shape[1], input.shape[2], -1, 2).view_as_complex();
+ freqsComplex = freqsComplex.to(input.device);
+
+ // Reshape the freqs_complex tensor to match the shape of the x_complex tensor. So we need to add the batch dimension and the head dimension
+ // (Seq_Len, Head_Dim/2) --> (1, Seq_Len, 1, Head_Dim/2)
+ var freqsComplexReshaped = freqsComplex.unsqueeze(0).unsqueeze(2);
+
+ // Multiply each complex number in the x_complex tensor by the corresponding complex number in the freqs_complex tensor
+ // Which results in the rotation of the complex number as shown in the Figure 1 of the paper
+ // (B, Seq_Len, H, Head_Dim/2) * (1, Seq_Len, 1, Head_Dim/2) = (B, Seq_Len, H, Head_Dim/2)
+ var rotatedComplex = inputComplex * freqsComplexReshaped;
+
+ // Convert the complex number back to the real number
+ // (B, Seq_Len, H, Head_Dim/2) -> (B, Seq_Len, H, Head_Dim/2, 2)
+ var rotated = rotatedComplex.view_as_real();
+
+ // (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim)
+ var rotatedReshaped = rotated.reshape(rotated.shape[0], rotated.shape[1], rotated.shape[2], -1);
+
+ return rotatedReshaped.type_as(input);
+ }
+
+ public static Tensor PrecomputeThetaPosFrequencies(int headDim, int seqLen, string device, float theta = 10000.0f)
+ {
+ // As written in the paragraph 3.2.2 of the paper
+ // >> In order to generalize our results in 2D to any xi ∈ Rd where **d is even**, [...]
+ if (headDim % 2 != 0)
+ {
+ throw new ArgumentException("Dimension must be divisible by 2", nameof(headDim));
+ }
+
+ // Build the theta parameter
+ // According to the formula theta_i = 10000^(-2(i-1)/dim) for i = [1, 2, ... dim/2]
+ // Shape: (Head_Dim / 2)
+ var thetaNumerator = torch.arange(0, headDim, 2).to(torch.float32).to(device);
+ // Shape: (Head_Dim / 2)
+ var thetaInput = torch.pow(theta, -1.0f * (thetaNumerator / headDim)).to(device); // (Dim / 2)
+ // Construct the positions (the "m" parameter)
+ // Shape: (Seq_Len)
+ var m = torch.arange(seqLen, device: device);
+ // Multiply each theta by each position using the outer product.
+ // Shape: (Seq_Len) outer_product* (Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
+ var freqs = torch.outer(m, thetaInput).to(torch.float32).to(device);
+
+ // We can compute complex numbers in the polar form c = R * exp(m * theta), where R = 1 as follows:
+ // (Seq_Len, Head_Dim / 2) -> (Seq_Len, Head_Dim / 2)
+ var freqsComplex = torch.polar(torch.ones_like(freqs), freqs);
+
+ return freqsComplex;
+ }
+
+ // python
+ // def rotate_half(x):
+ // """Rotates half the hidden dims of the input."""
+ // x1 = x[..., : x.shape[-1] // 2]
+ // x2 = x[..., x.shape[-1] // 2 :]
+ // return torch.cat((-x2, x1), dim=-1)
+ public static Tensor RotateHalf(Tensor x)
+ {
+ var x1 = x[.., .., .., ..(int)(x.shape[^1] / 2)];
+ var x2 = x[.., .., .., (int)(x.shape[^1] / 2)..];
+ return torch.cat([-x2, x1], dim: -1);
+ }
+
+ public static (Tensor, Tensor) ApplyRotaryPosEmb(Tensor q, Tensor k, Tensor cos, Tensor sin, Tensor? positionIds = null, int unsqueezeDim = 1)
+ {
+ // The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ // sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ // that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ // k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ // cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ // the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+
+ if (positionIds is not null)
+ {
+ cos = cos[positionIds!].unsqueeze(unsqueezeDim);
+ sin = sin[positionIds!].unsqueeze(unsqueezeDim);
+ }
+ else
+ {
+ cos = cos.unsqueeze(unsqueezeDim);
+ sin = sin.unsqueeze(unsqueezeDim);
+ }
+ var qEmbed = q * cos;
+ qEmbed += RotateHalf(q) * sin;
+
+ var kEmbed = k * cos;
+ kEmbed += RotateHalf(k) * sin;
+ // var kEmbed = (k * cos) + (RotateHalf(k) * sin);
+ return (qEmbed, kEmbed);
+ }
+
+ public static Module GetActivation(string actFn)
+ {
+ return actFn switch
+ {
+ "silu" => nn.SiLU(),
+ "relu" => nn.ReLU(),
+ "gelu" => nn.GELU(),
+ "tanh" => nn.Tanh(),
+ "swish" => nn.SiLU(),
+ _ => throw new ArgumentException("Invalid activation function", actFn),
+ };
+ }
+
+ public static Tensor Phi2RepeatKV(Tensor x, int nRep)
+ {
+ var batchSize = x.shape[0];
+ var seqLen = x.shape[1];
+ var nKVHeads = x.shape[2];
+ var headDim = x.shape[3];
+ if (nRep == 1)
+ {
+ return x;
+ }
+
+ return x.unsqueeze(3)
+ .expand(batchSize, seqLen, nKVHeads, nRep, headDim)
+ .view(batchSize, seqLen, nKVHeads * nRep, headDim);
+ }
+
+ public static Tensor Phi3RepeatKV(Tensor x, int nRep)
+ {
+ var batchSize = x.shape[0];
+ var nKVHeads = x.shape[1];
+ var seqLen = x.shape[2];
+ var headDim = x.shape[3];
+ if (nRep == 1)
+ {
+ return x;
+ }
+
+ return x.unsqueeze(3)
+ .expand(batchSize, nKVHeads, nRep, seqLen, headDim)
+ .view(batchSize, nKVHeads * nRep, seqLen, headDim);
+ }
+}
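To make the rotate-half identity concrete: with `cos` and `sin` broadcast per position, `ApplyRotaryPosEmb` computes `q_embed = q * cos + rotate_half(q) * sin`, so each pair `(q_i, q_{i+d/2})` is rotated by the position's angle in its own 2-D plane. A tiny worked example under the assumption of a single batch, head, and position (`Utils` is internal, so this would run from inside the assembly, e.g. a test):

```csharp
using Microsoft.ML.GenAI.Phi;
using static TorchSharp.torch;

// head_dim = 4, one batch/head/position; angle θ = 90°, so cos = 0, sin = 1
var q = tensor(new float[] { 1f, 0f, 0f, 1f }).reshape(1, 1, 1, 4);
var cos = zeros(1, 1, 4);
var sin = ones(1, 1, 4);

var (qEmbed, _) = Utils.ApplyRotaryPosEmb(q, q, cos, sin);

// rotate_half([1, 0, 0, 1]) = [-0, -1, 1, 0], so
// q_embed = q * 0 + rotate_half(q) * 1 = [0, -1, 1, 0]
```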
diff --git a/test/Microsoft.ML.GenAI.Core.Tests/Microsoft.ML.GenAI.Core.Tests.csproj b/test/Microsoft.ML.GenAI.Core.Tests/Microsoft.ML.GenAI.Core.Tests.csproj
new file mode 100644
index 0000000000..9960611dcf
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Core.Tests/Microsoft.ML.GenAI.Core.Tests.csproj
@@ -0,0 +1,31 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net6.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <IsTestProject>true</IsTestProject>
+  </PropertyGroup>
+
+  <!-- PackageReference / ProjectReference ItemGroups elided -->
+
+</Project>
diff --git a/test/Microsoft.ML.GenAI.Core.Tests/QuantizedLinearTests.cs b/test/Microsoft.ML.GenAI.Core.Tests/QuantizedLinearTests.cs
new file mode 100644
index 0000000000..d1653721ba
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Core.Tests/QuantizedLinearTests.cs
@@ -0,0 +1,127 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using FluentAssertions;
+using Microsoft.ML.GenAI.Core.Extension;
+using TorchSharp;
+using Xunit;
+using static TorchSharp.torch;
+
+namespace Microsoft.ML.GenAI.Core.Tests;
+
+public class QuantizedLinearTests
+{
+ [Fact]
+ public void Int4QuantizeSizeTests()
+ {
+ // the "meta" device is critical for this test: tensors on it carry
+ // shape and dtype metadata but allocate no storage, so the layer never
+ // has to materialize on a real cpu or cuda device.
+ var device = "meta";
+ var model = new QuantizedLinear(100000, 100, device: device);
+
+ var sizeInBytes = model.GetSizeInBytes();
+
+ var sizeInMegaBytes = sizeInBytes / 1024 / 1024;
+ sizeInMegaBytes.Should().Be(38);
+
+ // to int4
+ model.Int4();
+ var sizeInBytesAfterInt4 = model.GetSizeInBytes();
+ var sizeInMegaBytesAfterInt4 = sizeInBytesAfterInt4 / 1024 / 1024;
+ sizeInMegaBytesAfterInt4.Should().Be(4); // 38 / 8 = 4 with integer division
+ }
+
+ [Fact]
+ public void Int8QuantizeSizeTests()
+ {
+ // the "meta" device is critical for this test: tensors on it carry
+ // shape and dtype metadata but allocate no storage, so the layer never
+ // has to materialize on a real cpu or cuda device.
+ var device = "meta";
+ var model = new QuantizedLinear(100000, 100, device: device);
+
+ var sizeInBytes = model.GetSizeInBytes();
+
+ var sizeInMegaBytes = sizeInBytes / 1024 / 1024;
+ sizeInMegaBytes.Should().Be(38);
+
+ // to int8
+ model.Int8();
+ var sizeInBytesAfterInt8 = model.GetSizeInBytes();
+ var sizeInMegaBytesAfterInt8 = sizeInBytesAfterInt8 / 1024 / 1024;
+ sizeInMegaBytesAfterInt8.Should().Be(9); // 38 / 4 = 9 with integer division
+ }
+
+ [Fact]
+ public void Int4QuantizeForwardTest()
+ {
+ var device = "cpu";
+ var model = new QuantizedLinear(123, 10, device: device);
+
+ // set weight and bias to fixed values in the int4 range
+ // and compare the forward result before and after ToInt4QuantizeModule
+ var input = torch.ones([10, 2200, 123], device: device);
+ var weight = torch.ones([10, 123], device: device, dtype: ScalarType.Int64) * -1;
+ var bias = torch.ones([10], device: device) * 2;
+
+ var weightStr = weight.Peek("weight").ToString();
+
+ weight = (weight + 8).view(-1).to(torch.uint8);
+ var weightPlaceHolderDim = (int)weight.size(0);
+ weightPlaceHolderDim = weightPlaceHolderDim / 2 + weightPlaceHolderDim % 2;
+ var weightPlaceHolder = weight[..weightPlaceHolderDim];
+ weightPlaceHolder = weightPlaceHolder * 16 + weight[weightPlaceHolderDim..];
+
+ var high4Bit = weightPlaceHolder / 16;
+ var low4Bit = weightPlaceHolder % 16;
+ weight = torch.cat(new Tensor[] { high4Bit, low4Bit }).view(10, 123);
+ weight = weight.to(torch.int64);
+ weight -= 8;
+ weight.Peek("weight").Should().Be(weightStr);
+
+ model.load_state_dict(new Dictionary<string, Tensor>
+ {
+ ["weight"] = weight,
+ ["bias"] = bias
+ });
+
+ var resultBeforeInt4 = model.forward(input);
+
+ model.ToInt4QuantizeModule();
+
+ var resultAfterInt4 = model.forward(input);
+
+ // compare the result
+ resultBeforeInt4.Peek("result").Should().Be(resultAfterInt4.Peek("result"));
+ }
+
+ [Fact]
+ public void Int8QuantizeForwardTest()
+ {
+ var device = "cpu";
+ var model = new QuantizedLinear(123, 10, device: device);
+
+ // set weight and bias to fixed values in the int8 range
+ // and compare the forward result before and after ToInt8QuantizeModule
+ var input = torch.ones([10, 2200, 123], device: device);
+ var weight = torch.ones([10, 123], device: device) * -1;
+ var bias = torch.ones([10], device: device) * 2;
+
+ model.load_state_dict(new Dictionary<string, Tensor>
+ {
+ ["weight"] = weight,
+ ["bias"] = bias
+ });
+
+ var resultBeforeInt8 = model.forward(input);
+
+ model.ToInt8QuantizeModule();
+
+ var resultAfterInt8 = model.forward(input);
+
+ resultBeforeInt8.Peek("result").Should().Be("result: sum: 312.6933 dType: Float32 shape: [10,2200,10]");
+ resultAfterInt8.Peek("result").Should().Be("result: sum: 312.6933 dType: Float32 shape: [10,2200,10]");
+ }
+}
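The expected numbers in the size tests are plain integer arithmetic over the layer's parameter count, sketched below; the per-channel scale and zero-point tensors that quantization adds are small enough to vanish in the integer division.

```csharp
long weights = 100_000L * 100;           // 10,000,000 fp32 parameters
long fp32Bytes = weights * 4;            // 40,000,000 bytes
long fp32Mb = fp32Bytes / 1024 / 1024;   // 38 MB (integer division)

long int8Mb = weights * 1 / 1024 / 1024; // ~9 MB: one byte per weight
long int4Mb = weights / 2 / 1024 / 1024; // ~4 MB: two weights packed per byte
```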
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.LoadSafeTensorShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.LoadSafeTensorShapeTest.approved.txt
new file mode 100644
index 0000000000..75e17ad1a6
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.LoadSafeTensorShapeTest.approved.txt
@@ -0,0 +1,453 @@
+0: lm_head.bias shape: [51200]
+1: lm_head.weight shape: [51200, 2560]
+2: model.embed_tokens.weight shape: [51200, 2560]
+3: model.final_layernorm.bias shape: [2560]
+4: model.final_layernorm.weight shape: [2560]
+5: model.layers.0.input_layernorm.bias shape: [2560]
+6: model.layers.0.input_layernorm.weight shape: [2560]
+7: model.layers.0.mlp.fc1.bias shape: [10240]
+8: model.layers.0.mlp.fc1.weight shape: [10240, 2560]
+9: model.layers.0.mlp.fc2.bias shape: [2560]
+10: model.layers.0.mlp.fc2.weight shape: [2560, 10240]
+11: model.layers.0.self_attn.dense.bias shape: [2560]
+12: model.layers.0.self_attn.dense.weight shape: [2560, 2560]
+13: model.layers.0.self_attn.k_proj.bias shape: [2560]
+14: model.layers.0.self_attn.k_proj.weight shape: [2560, 2560]
+15: model.layers.0.self_attn.q_proj.bias shape: [2560]
+16: model.layers.0.self_attn.q_proj.weight shape: [2560, 2560]
+17: model.layers.0.self_attn.v_proj.bias shape: [2560]
+18: model.layers.0.self_attn.v_proj.weight shape: [2560, 2560]
+19: model.layers.1.input_layernorm.bias shape: [2560]
+20: model.layers.1.input_layernorm.weight shape: [2560]
+21: model.layers.1.mlp.fc1.bias shape: [10240]
+22: model.layers.1.mlp.fc1.weight shape: [10240, 2560]
+23: model.layers.1.mlp.fc2.bias shape: [2560]
+24: model.layers.1.mlp.fc2.weight shape: [2560, 10240]
+25: model.layers.1.self_attn.dense.bias shape: [2560]
+26: model.layers.1.self_attn.dense.weight shape: [2560, 2560]
+27: model.layers.1.self_attn.k_proj.bias shape: [2560]
+28: model.layers.1.self_attn.k_proj.weight shape: [2560, 2560]
+29: model.layers.1.self_attn.q_proj.bias shape: [2560]
+30: model.layers.1.self_attn.q_proj.weight shape: [2560, 2560]
+31: model.layers.1.self_attn.v_proj.bias shape: [2560]
+32: model.layers.1.self_attn.v_proj.weight shape: [2560, 2560]
+33: model.layers.10.input_layernorm.bias shape: [2560]
+34: model.layers.10.input_layernorm.weight shape: [2560]
+35: model.layers.10.mlp.fc1.bias shape: [10240]
+36: model.layers.10.mlp.fc1.weight shape: [10240, 2560]
+37: model.layers.10.mlp.fc2.bias shape: [2560]
+38: model.layers.10.mlp.fc2.weight shape: [2560, 10240]
+39: model.layers.10.self_attn.dense.bias shape: [2560]
+40: model.layers.10.self_attn.dense.weight shape: [2560, 2560]
+41: model.layers.10.self_attn.k_proj.bias shape: [2560]
+42: model.layers.10.self_attn.k_proj.weight shape: [2560, 2560]
+43: model.layers.10.self_attn.q_proj.bias shape: [2560]
+44: model.layers.10.self_attn.q_proj.weight shape: [2560, 2560]
+45: model.layers.10.self_attn.v_proj.bias shape: [2560]
+46: model.layers.10.self_attn.v_proj.weight shape: [2560, 2560]
+47: model.layers.11.input_layernorm.bias shape: [2560]
+48: model.layers.11.input_layernorm.weight shape: [2560]
+49: model.layers.11.mlp.fc1.bias shape: [10240]
+50: model.layers.11.mlp.fc1.weight shape: [10240, 2560]
+51: model.layers.11.mlp.fc2.bias shape: [2560]
+52: model.layers.11.mlp.fc2.weight shape: [2560, 10240]
+53: model.layers.11.self_attn.dense.bias shape: [2560]
+54: model.layers.11.self_attn.dense.weight shape: [2560, 2560]
+55: model.layers.11.self_attn.k_proj.bias shape: [2560]
+56: model.layers.11.self_attn.k_proj.weight shape: [2560, 2560]
+57: model.layers.11.self_attn.q_proj.bias shape: [2560]
+58: model.layers.11.self_attn.q_proj.weight shape: [2560, 2560]
+59: model.layers.11.self_attn.v_proj.bias shape: [2560]
+60: model.layers.11.self_attn.v_proj.weight shape: [2560, 2560]
+61: model.layers.12.input_layernorm.bias shape: [2560]
+62: model.layers.12.input_layernorm.weight shape: [2560]
+63: model.layers.12.mlp.fc1.bias shape: [10240]
+64: model.layers.12.mlp.fc1.weight shape: [10240, 2560]
+65: model.layers.12.mlp.fc2.bias shape: [2560]
+66: model.layers.12.mlp.fc2.weight shape: [2560, 10240]
+67: model.layers.12.self_attn.dense.bias shape: [2560]
+68: model.layers.12.self_attn.dense.weight shape: [2560, 2560]
+69: model.layers.12.self_attn.k_proj.bias shape: [2560]
+70: model.layers.12.self_attn.k_proj.weight shape: [2560, 2560]
+71: model.layers.12.self_attn.q_proj.bias shape: [2560]
+72: model.layers.12.self_attn.q_proj.weight shape: [2560, 2560]
+73: model.layers.12.self_attn.v_proj.bias shape: [2560]
+74: model.layers.12.self_attn.v_proj.weight shape: [2560, 2560]
+75: model.layers.13.input_layernorm.bias shape: [2560]
+76: model.layers.13.input_layernorm.weight shape: [2560]
+77: model.layers.13.mlp.fc1.bias shape: [10240]
+78: model.layers.13.mlp.fc1.weight shape: [10240, 2560]
+79: model.layers.13.mlp.fc2.bias shape: [2560]
+80: model.layers.13.mlp.fc2.weight shape: [2560, 10240]
+81: model.layers.13.self_attn.dense.bias shape: [2560]
+82: model.layers.13.self_attn.dense.weight shape: [2560, 2560]
+83: model.layers.13.self_attn.k_proj.bias shape: [2560]
+84: model.layers.13.self_attn.k_proj.weight shape: [2560, 2560]
+85: model.layers.13.self_attn.q_proj.bias shape: [2560]
+86: model.layers.13.self_attn.q_proj.weight shape: [2560, 2560]
+87: model.layers.13.self_attn.v_proj.bias shape: [2560]
+88: model.layers.13.self_attn.v_proj.weight shape: [2560, 2560]
+89: model.layers.14.input_layernorm.bias shape: [2560]
+90: model.layers.14.input_layernorm.weight shape: [2560]
+91: model.layers.14.mlp.fc1.bias shape: [10240]
+92: model.layers.14.mlp.fc1.weight shape: [10240, 2560]
+93: model.layers.14.mlp.fc2.bias shape: [2560]
+94: model.layers.14.mlp.fc2.weight shape: [2560, 10240]
+95: model.layers.14.self_attn.dense.bias shape: [2560]
+96: model.layers.14.self_attn.dense.weight shape: [2560, 2560]
+97: model.layers.14.self_attn.k_proj.bias shape: [2560]
+98: model.layers.14.self_attn.k_proj.weight shape: [2560, 2560]
+99: model.layers.14.self_attn.q_proj.bias shape: [2560]
+100: model.layers.14.self_attn.q_proj.weight shape: [2560, 2560]
+101: model.layers.14.self_attn.v_proj.bias shape: [2560]
+102: model.layers.14.self_attn.v_proj.weight shape: [2560, 2560]
+103: model.layers.15.input_layernorm.bias shape: [2560]
+104: model.layers.15.input_layernorm.weight shape: [2560]
+105: model.layers.15.mlp.fc1.bias shape: [10240]
+106: model.layers.15.mlp.fc1.weight shape: [10240, 2560]
+107: model.layers.15.mlp.fc2.bias shape: [2560]
+108: model.layers.15.mlp.fc2.weight shape: [2560, 10240]
+109: model.layers.15.self_attn.dense.bias shape: [2560]
+110: model.layers.15.self_attn.dense.weight shape: [2560, 2560]
+111: model.layers.15.self_attn.k_proj.bias shape: [2560]
+112: model.layers.15.self_attn.k_proj.weight shape: [2560, 2560]
+113: model.layers.15.self_attn.q_proj.bias shape: [2560]
+114: model.layers.15.self_attn.q_proj.weight shape: [2560, 2560]
+115: model.layers.15.self_attn.v_proj.bias shape: [2560]
+116: model.layers.15.self_attn.v_proj.weight shape: [2560, 2560]
+117: model.layers.16.input_layernorm.bias shape: [2560]
+118: model.layers.16.input_layernorm.weight shape: [2560]
+119: model.layers.16.mlp.fc1.bias shape: [10240]
+120: model.layers.16.mlp.fc1.weight shape: [10240, 2560]
+121: model.layers.16.mlp.fc2.bias shape: [2560]
+122: model.layers.16.mlp.fc2.weight shape: [2560, 10240]
+123: model.layers.16.self_attn.dense.bias shape: [2560]
+124: model.layers.16.self_attn.dense.weight shape: [2560, 2560]
+125: model.layers.16.self_attn.k_proj.bias shape: [2560]
+126: model.layers.16.self_attn.k_proj.weight shape: [2560, 2560]
+127: model.layers.16.self_attn.q_proj.bias shape: [2560]
+128: model.layers.16.self_attn.q_proj.weight shape: [2560, 2560]
+129: model.layers.16.self_attn.v_proj.bias shape: [2560]
+130: model.layers.16.self_attn.v_proj.weight shape: [2560, 2560]
+131: model.layers.17.input_layernorm.bias shape: [2560]
+132: model.layers.17.input_layernorm.weight shape: [2560]
+133: model.layers.17.mlp.fc1.bias shape: [10240]
+134: model.layers.17.mlp.fc1.weight shape: [10240, 2560]
+135: model.layers.17.mlp.fc2.bias shape: [2560]
+136: model.layers.17.mlp.fc2.weight shape: [2560, 10240]
+137: model.layers.17.self_attn.dense.bias shape: [2560]
+138: model.layers.17.self_attn.dense.weight shape: [2560, 2560]
+139: model.layers.17.self_attn.k_proj.bias shape: [2560]
+140: model.layers.17.self_attn.k_proj.weight shape: [2560, 2560]
+141: model.layers.17.self_attn.q_proj.bias shape: [2560]
+142: model.layers.17.self_attn.q_proj.weight shape: [2560, 2560]
+143: model.layers.17.self_attn.v_proj.bias shape: [2560]
+144: model.layers.17.self_attn.v_proj.weight shape: [2560, 2560]
+145: model.layers.18.input_layernorm.bias shape: [2560]
+146: model.layers.18.input_layernorm.weight shape: [2560]
+147: model.layers.18.mlp.fc1.bias shape: [10240]
+148: model.layers.18.mlp.fc1.weight shape: [10240, 2560]
+149: model.layers.18.mlp.fc2.bias shape: [2560]
+150: model.layers.18.mlp.fc2.weight shape: [2560, 10240]
+151: model.layers.18.self_attn.dense.bias shape: [2560]
+152: model.layers.18.self_attn.dense.weight shape: [2560, 2560]
+153: model.layers.18.self_attn.k_proj.bias shape: [2560]
+154: model.layers.18.self_attn.k_proj.weight shape: [2560, 2560]
+155: model.layers.18.self_attn.q_proj.bias shape: [2560]
+156: model.layers.18.self_attn.q_proj.weight shape: [2560, 2560]
+157: model.layers.18.self_attn.v_proj.bias shape: [2560]
+158: model.layers.18.self_attn.v_proj.weight shape: [2560, 2560]
+159: model.layers.19.input_layernorm.bias shape: [2560]
+160: model.layers.19.input_layernorm.weight shape: [2560]
+161: model.layers.19.mlp.fc1.bias shape: [10240]
+162: model.layers.19.mlp.fc1.weight shape: [10240, 2560]
+163: model.layers.19.mlp.fc2.bias shape: [2560]
+164: model.layers.19.mlp.fc2.weight shape: [2560, 10240]
+165: model.layers.19.self_attn.dense.bias shape: [2560]
+166: model.layers.19.self_attn.dense.weight shape: [2560, 2560]
+167: model.layers.19.self_attn.k_proj.bias shape: [2560]
+168: model.layers.19.self_attn.k_proj.weight shape: [2560, 2560]
+169: model.layers.19.self_attn.q_proj.bias shape: [2560]
+170: model.layers.19.self_attn.q_proj.weight shape: [2560, 2560]
+171: model.layers.19.self_attn.v_proj.bias shape: [2560]
+172: model.layers.19.self_attn.v_proj.weight shape: [2560, 2560]
+173: model.layers.2.input_layernorm.bias shape: [2560]
+174: model.layers.2.input_layernorm.weight shape: [2560]
+175: model.layers.2.mlp.fc1.bias shape: [10240]
+176: model.layers.2.mlp.fc1.weight shape: [10240, 2560]
+177: model.layers.2.mlp.fc2.bias shape: [2560]
+178: model.layers.2.mlp.fc2.weight shape: [2560, 10240]
+179: model.layers.2.self_attn.dense.bias shape: [2560]
+180: model.layers.2.self_attn.dense.weight shape: [2560, 2560]
+181: model.layers.2.self_attn.k_proj.bias shape: [2560]
+182: model.layers.2.self_attn.k_proj.weight shape: [2560, 2560]
+183: model.layers.2.self_attn.q_proj.bias shape: [2560]
+184: model.layers.2.self_attn.q_proj.weight shape: [2560, 2560]
+185: model.layers.2.self_attn.v_proj.bias shape: [2560]
+186: model.layers.2.self_attn.v_proj.weight shape: [2560, 2560]
+187: model.layers.20.input_layernorm.bias shape: [2560]
+188: model.layers.20.input_layernorm.weight shape: [2560]
+189: model.layers.20.mlp.fc1.bias shape: [10240]
+190: model.layers.20.mlp.fc1.weight shape: [10240, 2560]
+191: model.layers.20.mlp.fc2.bias shape: [2560]
+192: model.layers.20.mlp.fc2.weight shape: [2560, 10240]
+193: model.layers.20.self_attn.dense.bias shape: [2560]
+194: model.layers.20.self_attn.dense.weight shape: [2560, 2560]
+195: model.layers.20.self_attn.k_proj.bias shape: [2560]
+196: model.layers.20.self_attn.k_proj.weight shape: [2560, 2560]
+197: model.layers.20.self_attn.q_proj.bias shape: [2560]
+198: model.layers.20.self_attn.q_proj.weight shape: [2560, 2560]
+199: model.layers.20.self_attn.v_proj.bias shape: [2560]
+200: model.layers.20.self_attn.v_proj.weight shape: [2560, 2560]
+201: model.layers.21.input_layernorm.bias shape: [2560]
+202: model.layers.21.input_layernorm.weight shape: [2560]
+203: model.layers.21.mlp.fc1.bias shape: [10240]
+204: model.layers.21.mlp.fc1.weight shape: [10240, 2560]
+205: model.layers.21.mlp.fc2.bias shape: [2560]
+206: model.layers.21.mlp.fc2.weight shape: [2560, 10240]
+207: model.layers.21.self_attn.dense.bias shape: [2560]
+208: model.layers.21.self_attn.dense.weight shape: [2560, 2560]
+209: model.layers.21.self_attn.k_proj.bias shape: [2560]
+210: model.layers.21.self_attn.k_proj.weight shape: [2560, 2560]
+211: model.layers.21.self_attn.q_proj.bias shape: [2560]
+212: model.layers.21.self_attn.q_proj.weight shape: [2560, 2560]
+213: model.layers.21.self_attn.v_proj.bias shape: [2560]
+214: model.layers.21.self_attn.v_proj.weight shape: [2560, 2560]
+215: model.layers.22.input_layernorm.bias shape: [2560]
+216: model.layers.22.input_layernorm.weight shape: [2560]
+217: model.layers.22.mlp.fc1.bias shape: [10240]
+218: model.layers.22.mlp.fc1.weight shape: [10240, 2560]
+219: model.layers.22.mlp.fc2.bias shape: [2560]
+220: model.layers.22.mlp.fc2.weight shape: [2560, 10240]
+221: model.layers.22.self_attn.dense.bias shape: [2560]
+222: model.layers.22.self_attn.dense.weight shape: [2560, 2560]
+223: model.layers.22.self_attn.k_proj.bias shape: [2560]
+224: model.layers.22.self_attn.k_proj.weight shape: [2560, 2560]
+225: model.layers.22.self_attn.q_proj.bias shape: [2560]
+226: model.layers.22.self_attn.q_proj.weight shape: [2560, 2560]
+227: model.layers.22.self_attn.v_proj.bias shape: [2560]
+228: model.layers.22.self_attn.v_proj.weight shape: [2560, 2560]
+229: model.layers.23.input_layernorm.bias shape: [2560]
+230: model.layers.23.input_layernorm.weight shape: [2560]
+231: model.layers.23.mlp.fc1.bias shape: [10240]
+232: model.layers.23.mlp.fc1.weight shape: [10240, 2560]
+233: model.layers.23.mlp.fc2.bias shape: [2560]
+234: model.layers.23.mlp.fc2.weight shape: [2560, 10240]
+235: model.layers.23.self_attn.dense.bias shape: [2560]
+236: model.layers.23.self_attn.dense.weight shape: [2560, 2560]
+237: model.layers.23.self_attn.k_proj.bias shape: [2560]
+238: model.layers.23.self_attn.k_proj.weight shape: [2560, 2560]
+239: model.layers.23.self_attn.q_proj.bias shape: [2560]
+240: model.layers.23.self_attn.q_proj.weight shape: [2560, 2560]
+241: model.layers.23.self_attn.v_proj.bias shape: [2560]
+242: model.layers.23.self_attn.v_proj.weight shape: [2560, 2560]
+243: model.layers.24.input_layernorm.bias shape: [2560]
+244: model.layers.24.input_layernorm.weight shape: [2560]
+245: model.layers.24.mlp.fc1.bias shape: [10240]
+246: model.layers.24.mlp.fc1.weight shape: [10240, 2560]
+247: model.layers.24.mlp.fc2.bias shape: [2560]
+248: model.layers.24.mlp.fc2.weight shape: [2560, 10240]
+249: model.layers.24.self_attn.dense.bias shape: [2560]
+250: model.layers.24.self_attn.dense.weight shape: [2560, 2560]
+251: model.layers.24.self_attn.k_proj.bias shape: [2560]
+252: model.layers.24.self_attn.k_proj.weight shape: [2560, 2560]
+253: model.layers.24.self_attn.q_proj.bias shape: [2560]
+254: model.layers.24.self_attn.q_proj.weight shape: [2560, 2560]
+255: model.layers.24.self_attn.v_proj.bias shape: [2560]
+256: model.layers.24.self_attn.v_proj.weight shape: [2560, 2560]
+257: model.layers.25.input_layernorm.bias shape: [2560]
+258: model.layers.25.input_layernorm.weight shape: [2560]
+259: model.layers.25.mlp.fc1.bias shape: [10240]
+260: model.layers.25.mlp.fc1.weight shape: [10240, 2560]
+261: model.layers.25.mlp.fc2.bias shape: [2560]
+262: model.layers.25.mlp.fc2.weight shape: [2560, 10240]
+263: model.layers.25.self_attn.dense.bias shape: [2560]
+264: model.layers.25.self_attn.dense.weight shape: [2560, 2560]
+265: model.layers.25.self_attn.k_proj.bias shape: [2560]
+266: model.layers.25.self_attn.k_proj.weight shape: [2560, 2560]
+267: model.layers.25.self_attn.q_proj.bias shape: [2560]
+268: model.layers.25.self_attn.q_proj.weight shape: [2560, 2560]
+269: model.layers.25.self_attn.v_proj.bias shape: [2560]
+270: model.layers.25.self_attn.v_proj.weight shape: [2560, 2560]
+271: model.layers.26.input_layernorm.bias shape: [2560]
+272: model.layers.26.input_layernorm.weight shape: [2560]
+273: model.layers.26.mlp.fc1.bias shape: [10240]
+274: model.layers.26.mlp.fc1.weight shape: [10240, 2560]
+275: model.layers.26.mlp.fc2.bias shape: [2560]
+276: model.layers.26.mlp.fc2.weight shape: [2560, 10240]
+277: model.layers.26.self_attn.dense.bias shape: [2560]
+278: model.layers.26.self_attn.dense.weight shape: [2560, 2560]
+279: model.layers.26.self_attn.k_proj.bias shape: [2560]
+280: model.layers.26.self_attn.k_proj.weight shape: [2560, 2560]
+281: model.layers.26.self_attn.q_proj.bias shape: [2560]
+282: model.layers.26.self_attn.q_proj.weight shape: [2560, 2560]
+283: model.layers.26.self_attn.v_proj.bias shape: [2560]
+284: model.layers.26.self_attn.v_proj.weight shape: [2560, 2560]
+285: model.layers.27.input_layernorm.bias shape: [2560]
+286: model.layers.27.input_layernorm.weight shape: [2560]
+287: model.layers.27.mlp.fc1.bias shape: [10240]
+288: model.layers.27.mlp.fc1.weight shape: [10240, 2560]
+289: model.layers.27.mlp.fc2.bias shape: [2560]
+290: model.layers.27.mlp.fc2.weight shape: [2560, 10240]
+291: model.layers.27.self_attn.dense.bias shape: [2560]
+292: model.layers.27.self_attn.dense.weight shape: [2560, 2560]
+293: model.layers.27.self_attn.k_proj.bias shape: [2560]
+294: model.layers.27.self_attn.k_proj.weight shape: [2560, 2560]
+295: model.layers.27.self_attn.q_proj.bias shape: [2560]
+296: model.layers.27.self_attn.q_proj.weight shape: [2560, 2560]
+297: model.layers.27.self_attn.v_proj.bias shape: [2560]
+298: model.layers.27.self_attn.v_proj.weight shape: [2560, 2560]
+299: model.layers.28.input_layernorm.bias shape: [2560]
+300: model.layers.28.input_layernorm.weight shape: [2560]
+301: model.layers.28.mlp.fc1.bias shape: [10240]
+302: model.layers.28.mlp.fc1.weight shape: [10240, 2560]
+303: model.layers.28.mlp.fc2.bias shape: [2560]
+304: model.layers.28.mlp.fc2.weight shape: [2560, 10240]
+305: model.layers.28.self_attn.dense.bias shape: [2560]
+306: model.layers.28.self_attn.dense.weight shape: [2560, 2560]
+307: model.layers.28.self_attn.k_proj.bias shape: [2560]
+308: model.layers.28.self_attn.k_proj.weight shape: [2560, 2560]
+309: model.layers.28.self_attn.q_proj.bias shape: [2560]
+310: model.layers.28.self_attn.q_proj.weight shape: [2560, 2560]
+311: model.layers.28.self_attn.v_proj.bias shape: [2560]
+312: model.layers.28.self_attn.v_proj.weight shape: [2560, 2560]
+313: model.layers.29.input_layernorm.bias shape: [2560]
+314: model.layers.29.input_layernorm.weight shape: [2560]
+315: model.layers.29.mlp.fc1.bias shape: [10240]
+316: model.layers.29.mlp.fc1.weight shape: [10240, 2560]
+317: model.layers.29.mlp.fc2.bias shape: [2560]
+318: model.layers.29.mlp.fc2.weight shape: [2560, 10240]
+319: model.layers.29.self_attn.dense.bias shape: [2560]
+320: model.layers.29.self_attn.dense.weight shape: [2560, 2560]
+321: model.layers.29.self_attn.k_proj.bias shape: [2560]
+322: model.layers.29.self_attn.k_proj.weight shape: [2560, 2560]
+323: model.layers.29.self_attn.q_proj.bias shape: [2560]
+324: model.layers.29.self_attn.q_proj.weight shape: [2560, 2560]
+325: model.layers.29.self_attn.v_proj.bias shape: [2560]
+326: model.layers.29.self_attn.v_proj.weight shape: [2560, 2560]
+327: model.layers.3.input_layernorm.bias shape: [2560]
+328: model.layers.3.input_layernorm.weight shape: [2560]
+329: model.layers.3.mlp.fc1.bias shape: [10240]
+330: model.layers.3.mlp.fc1.weight shape: [10240, 2560]
+331: model.layers.3.mlp.fc2.bias shape: [2560]
+332: model.layers.3.mlp.fc2.weight shape: [2560, 10240]
+333: model.layers.3.self_attn.dense.bias shape: [2560]
+334: model.layers.3.self_attn.dense.weight shape: [2560, 2560]
+335: model.layers.3.self_attn.k_proj.bias shape: [2560]
+336: model.layers.3.self_attn.k_proj.weight shape: [2560, 2560]
+337: model.layers.3.self_attn.q_proj.bias shape: [2560]
+338: model.layers.3.self_attn.q_proj.weight shape: [2560, 2560]
+339: model.layers.3.self_attn.v_proj.bias shape: [2560]
+340: model.layers.3.self_attn.v_proj.weight shape: [2560, 2560]
+341: model.layers.30.input_layernorm.bias shape: [2560]
+342: model.layers.30.input_layernorm.weight shape: [2560]
+343: model.layers.30.mlp.fc1.bias shape: [10240]
+344: model.layers.30.mlp.fc1.weight shape: [10240, 2560]
+345: model.layers.30.mlp.fc2.bias shape: [2560]
+346: model.layers.30.mlp.fc2.weight shape: [2560, 10240]
+347: model.layers.30.self_attn.dense.bias shape: [2560]
+348: model.layers.30.self_attn.dense.weight shape: [2560, 2560]
+349: model.layers.30.self_attn.k_proj.bias shape: [2560]
+350: model.layers.30.self_attn.k_proj.weight shape: [2560, 2560]
+351: model.layers.30.self_attn.q_proj.bias shape: [2560]
+352: model.layers.30.self_attn.q_proj.weight shape: [2560, 2560]
+353: model.layers.30.self_attn.v_proj.bias shape: [2560]
+354: model.layers.30.self_attn.v_proj.weight shape: [2560, 2560]
+355: model.layers.31.input_layernorm.bias shape: [2560]
+356: model.layers.31.input_layernorm.weight shape: [2560]
+357: model.layers.31.mlp.fc1.bias shape: [10240]
+358: model.layers.31.mlp.fc1.weight shape: [10240, 2560]
+359: model.layers.31.mlp.fc2.bias shape: [2560]
+360: model.layers.31.mlp.fc2.weight shape: [2560, 10240]
+361: model.layers.31.self_attn.dense.bias shape: [2560]
+362: model.layers.31.self_attn.dense.weight shape: [2560, 2560]
+363: model.layers.31.self_attn.k_proj.bias shape: [2560]
+364: model.layers.31.self_attn.k_proj.weight shape: [2560, 2560]
+365: model.layers.31.self_attn.q_proj.bias shape: [2560]
+366: model.layers.31.self_attn.q_proj.weight shape: [2560, 2560]
+367: model.layers.31.self_attn.v_proj.bias shape: [2560]
+368: model.layers.31.self_attn.v_proj.weight shape: [2560, 2560]
+369: model.layers.4.input_layernorm.bias shape: [2560]
+370: model.layers.4.input_layernorm.weight shape: [2560]
+371: model.layers.4.mlp.fc1.bias shape: [10240]
+372: model.layers.4.mlp.fc1.weight shape: [10240, 2560]
+373: model.layers.4.mlp.fc2.bias shape: [2560]
+374: model.layers.4.mlp.fc2.weight shape: [2560, 10240]
+375: model.layers.4.self_attn.dense.bias shape: [2560]
+376: model.layers.4.self_attn.dense.weight shape: [2560, 2560]
+377: model.layers.4.self_attn.k_proj.bias shape: [2560]
+378: model.layers.4.self_attn.k_proj.weight shape: [2560, 2560]
+379: model.layers.4.self_attn.q_proj.bias shape: [2560]
+380: model.layers.4.self_attn.q_proj.weight shape: [2560, 2560]
+381: model.layers.4.self_attn.v_proj.bias shape: [2560]
+382: model.layers.4.self_attn.v_proj.weight shape: [2560, 2560]
+383: model.layers.5.input_layernorm.bias shape: [2560]
+384: model.layers.5.input_layernorm.weight shape: [2560]
+385: model.layers.5.mlp.fc1.bias shape: [10240]
+386: model.layers.5.mlp.fc1.weight shape: [10240, 2560]
+387: model.layers.5.mlp.fc2.bias shape: [2560]
+388: model.layers.5.mlp.fc2.weight shape: [2560, 10240]
+389: model.layers.5.self_attn.dense.bias shape: [2560]
+390: model.layers.5.self_attn.dense.weight shape: [2560, 2560]
+391: model.layers.5.self_attn.k_proj.bias shape: [2560]
+392: model.layers.5.self_attn.k_proj.weight shape: [2560, 2560]
+393: model.layers.5.self_attn.q_proj.bias shape: [2560]
+394: model.layers.5.self_attn.q_proj.weight shape: [2560, 2560]
+395: model.layers.5.self_attn.v_proj.bias shape: [2560]
+396: model.layers.5.self_attn.v_proj.weight shape: [2560, 2560]
+397: model.layers.6.input_layernorm.bias shape: [2560]
+398: model.layers.6.input_layernorm.weight shape: [2560]
+399: model.layers.6.mlp.fc1.bias shape: [10240]
+400: model.layers.6.mlp.fc1.weight shape: [10240, 2560]
+401: model.layers.6.mlp.fc2.bias shape: [2560]
+402: model.layers.6.mlp.fc2.weight shape: [2560, 10240]
+403: model.layers.6.self_attn.dense.bias shape: [2560]
+404: model.layers.6.self_attn.dense.weight shape: [2560, 2560]
+405: model.layers.6.self_attn.k_proj.bias shape: [2560]
+406: model.layers.6.self_attn.k_proj.weight shape: [2560, 2560]
+407: model.layers.6.self_attn.q_proj.bias shape: [2560]
+408: model.layers.6.self_attn.q_proj.weight shape: [2560, 2560]
+409: model.layers.6.self_attn.v_proj.bias shape: [2560]
+410: model.layers.6.self_attn.v_proj.weight shape: [2560, 2560]
+411: model.layers.7.input_layernorm.bias shape: [2560]
+412: model.layers.7.input_layernorm.weight shape: [2560]
+413: model.layers.7.mlp.fc1.bias shape: [10240]
+414: model.layers.7.mlp.fc1.weight shape: [10240, 2560]
+415: model.layers.7.mlp.fc2.bias shape: [2560]
+416: model.layers.7.mlp.fc2.weight shape: [2560, 10240]
+417: model.layers.7.self_attn.dense.bias shape: [2560]
+418: model.layers.7.self_attn.dense.weight shape: [2560, 2560]
+419: model.layers.7.self_attn.k_proj.bias shape: [2560]
+420: model.layers.7.self_attn.k_proj.weight shape: [2560, 2560]
+421: model.layers.7.self_attn.q_proj.bias shape: [2560]
+422: model.layers.7.self_attn.q_proj.weight shape: [2560, 2560]
+423: model.layers.7.self_attn.v_proj.bias shape: [2560]
+424: model.layers.7.self_attn.v_proj.weight shape: [2560, 2560]
+425: model.layers.8.input_layernorm.bias shape: [2560]
+426: model.layers.8.input_layernorm.weight shape: [2560]
+427: model.layers.8.mlp.fc1.bias shape: [10240]
+428: model.layers.8.mlp.fc1.weight shape: [10240, 2560]
+429: model.layers.8.mlp.fc2.bias shape: [2560]
+430: model.layers.8.mlp.fc2.weight shape: [2560, 10240]
+431: model.layers.8.self_attn.dense.bias shape: [2560]
+432: model.layers.8.self_attn.dense.weight shape: [2560, 2560]
+433: model.layers.8.self_attn.k_proj.bias shape: [2560]
+434: model.layers.8.self_attn.k_proj.weight shape: [2560, 2560]
+435: model.layers.8.self_attn.q_proj.bias shape: [2560]
+436: model.layers.8.self_attn.q_proj.weight shape: [2560, 2560]
+437: model.layers.8.self_attn.v_proj.bias shape: [2560]
+438: model.layers.8.self_attn.v_proj.weight shape: [2560, 2560]
+439: model.layers.9.input_layernorm.bias shape: [2560]
+440: model.layers.9.input_layernorm.weight shape: [2560]
+441: model.layers.9.mlp.fc1.bias shape: [10240]
+442: model.layers.9.mlp.fc1.weight shape: [10240, 2560]
+443: model.layers.9.mlp.fc2.bias shape: [2560]
+444: model.layers.9.mlp.fc2.weight shape: [2560, 10240]
+445: model.layers.9.self_attn.dense.bias shape: [2560]
+446: model.layers.9.self_attn.dense.weight shape: [2560, 2560]
+447: model.layers.9.self_attn.k_proj.bias shape: [2560]
+448: model.layers.9.self_attn.k_proj.weight shape: [2560, 2560]
+449: model.layers.9.self_attn.q_proj.bias shape: [2560]
+450: model.layers.9.self_attn.q_proj.weight shape: [2560, 2560]
+451: model.layers.9.self_attn.v_proj.bias shape: [2560]
+452: model.layers.9.self_attn.v_proj.weight shape: [2560, 2560]
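
The entries above close out the Phi-2 shape dump. Each tensor is rendered as `index: name shape: [dims]`, and the listing is in ordinal (plain string) order, which is why `model.layers.3.*` sorts between `model.layers.29.*` and `model.layers.30.*`. A minimal sketch of how such a dump can be produced, assuming TorchSharp and ApprovalTests — the PR's actual helper may well differ:

```csharp
// Minimal sketch, assuming TorchSharp + ApprovalTests -- not necessarily
// how the PR's shape test is implemented. named_parameters() and
// Tensor.shape are real TorchSharp APIs; `model` is the loaded module.
using System;
using System.Linq;
using System.Text;
using static TorchSharp.torch;

static string DumpShapes(nn.Module model)
{
    var sb = new StringBuilder();
    int i = 0;
    // Ordinal sort reproduces the ordering above: "layers.3." sorts
    // before "layers.30" because '.' orders before '0'.
    foreach (var (name, parameter) in model.named_parameters()
                                           .OrderBy(p => p.name, StringComparer.Ordinal))
    {
        sb.AppendLine($"{i++}: {name} shape: [{string.Join(", ", parameter.shape)}]");
    }
    return sb.ToString();
}
```

Passing the result to `Approvals.Verify` yields exactly the `*.approved.txt` layout seen in these fixtures.
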
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.TokenizerTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.TokenizerTest.approved.txt
new file mode 100644
index 0000000000..7338548917
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Test.TokenizerTest.approved.txt
@@ -0,0 +1,3 @@
+50256, 6090, 345, 2148, 2842, 284, 4483, 17790, 286, 35484, 290, 10441, 69, 50187, 30
+50256, 19457, 0, 3423, 389, 617, 2842, 284, 4483, 35484, 290, 10441, 69, 50187, 1978, 25, 352, 13, 40058, 290, 10441, 34711, 7209, 494, 25, 41198, 35484, 290, 10441, 69, 50187, 1978, 351, 617, 7545, 290, 12498, 13, 362, 13, 40058, 290, 10441, 34711, 20698, 25, 15561, 26790, 35484, 290, 10441, 69, 50187, 1978, 351, 617, 18873, 13135, 290, 12498, 13
+50256, 2061, 546, 18120, 281, 362, 87, 1343, 513, 796, 767, 16022, 30
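
Each approved line is a comma-separated token-id sequence prefixed with 50256, the GPT-2-family end-of-text token that Phi-2's CodeGen vocabulary reuses as a BOS marker; the first and third lines appear to decode to the familiar Phi sample prompts about banana/dragonfruit combinations and the `2x + 3 = 7` equation. A hedged sketch of the kind of xunit + ApprovalTests test that could emit this file — `CreateTokenizer` is a hypothetical stand-in for however the test actually loads Phi-2's BPE vocab and merges:

```csharp
// Illustrative sketch only -- not necessarily the PR's actual test code.
// Assumes xunit + ApprovalTests + Microsoft.ML.Tokenizers; CreateTokenizer
// is a hypothetical helper, and the middle message is abridged here.
using System;
using System.Linq;
using ApprovalTests;
using ApprovalTests.Namers;
using Microsoft.ML.Tokenizers;
using Xunit;

public class Phi2Test
{
    [Fact]
    [UseApprovalSubdirectory("Approvals")] // resolves to Phi2Test.TokenizerTest.approved.txt
    public void TokenizerTest()
    {
        Tokenizer tokenizer = CreateTokenizer();

        string[] messages =
        {
            "Can you provide ways to eat combinations of bananas and dragonfruits?",
            "...", // assistant reply omitted in this sketch
            "What about solving an 2x + 3 = 7 equation?",
        };

        // One line per message: comma-separated ids with 50256 prepended,
        // matching the approved lines above.
        var lines = messages.Select(m =>
            string.Join(", ", new[] { 50256 }.Concat(tokenizer.EncodeToIds(m))));

        Approvals.Verify(string.Join(Environment.NewLine, lines));
    }

    private static Tokenizer CreateTokenizer()
        => throw new NotImplementedException(); // hypothetical loader for Phi-2's BPE files
}
```
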
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.LoadSafeTensorShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.LoadSafeTensorShapeTest.approved.txt
new file mode 100644
index 0000000000..75e17ad1a6
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.LoadSafeTensorShapeTest.approved.txt
@@ -0,0 +1,453 @@
+0: lm_head.bias shape: [51200]
+1: lm_head.weight shape: [51200, 2560]
+2: model.embed_tokens.weight shape: [51200, 2560]
+3: model.final_layernorm.bias shape: [2560]
+4: model.final_layernorm.weight shape: [2560]
+5: model.layers.0.input_layernorm.bias shape: [2560]
+6: model.layers.0.input_layernorm.weight shape: [2560]
+7: model.layers.0.mlp.fc1.bias shape: [10240]
+8: model.layers.0.mlp.fc1.weight shape: [10240, 2560]
+9: model.layers.0.mlp.fc2.bias shape: [2560]
+10: model.layers.0.mlp.fc2.weight shape: [2560, 10240]
+11: model.layers.0.self_attn.dense.bias shape: [2560]
+12: model.layers.0.self_attn.dense.weight shape: [2560, 2560]
+13: model.layers.0.self_attn.k_proj.bias shape: [2560]
+14: model.layers.0.self_attn.k_proj.weight shape: [2560, 2560]
+15: model.layers.0.self_attn.q_proj.bias shape: [2560]
+16: model.layers.0.self_attn.q_proj.weight shape: [2560, 2560]
+17: model.layers.0.self_attn.v_proj.bias shape: [2560]
+18: model.layers.0.self_attn.v_proj.weight shape: [2560, 2560]
+19: model.layers.1.input_layernorm.bias shape: [2560]
+20: model.layers.1.input_layernorm.weight shape: [2560]
+21: model.layers.1.mlp.fc1.bias shape: [10240]
+22: model.layers.1.mlp.fc1.weight shape: [10240, 2560]
+23: model.layers.1.mlp.fc2.bias shape: [2560]
+24: model.layers.1.mlp.fc2.weight shape: [2560, 10240]
+25: model.layers.1.self_attn.dense.bias shape: [2560]
+26: model.layers.1.self_attn.dense.weight shape: [2560, 2560]
+27: model.layers.1.self_attn.k_proj.bias shape: [2560]
+28: model.layers.1.self_attn.k_proj.weight shape: [2560, 2560]
+29: model.layers.1.self_attn.q_proj.bias shape: [2560]
+30: model.layers.1.self_attn.q_proj.weight shape: [2560, 2560]
+31: model.layers.1.self_attn.v_proj.bias shape: [2560]
+32: model.layers.1.self_attn.v_proj.weight shape: [2560, 2560]
+33: model.layers.10.input_layernorm.bias shape: [2560]
+34: model.layers.10.input_layernorm.weight shape: [2560]
+35: model.layers.10.mlp.fc1.bias shape: [10240]
+36: model.layers.10.mlp.fc1.weight shape: [10240, 2560]
+37: model.layers.10.mlp.fc2.bias shape: [2560]
+38: model.layers.10.mlp.fc2.weight shape: [2560, 10240]
+39: model.layers.10.self_attn.dense.bias shape: [2560]
+40: model.layers.10.self_attn.dense.weight shape: [2560, 2560]
+41: model.layers.10.self_attn.k_proj.bias shape: [2560]
+42: model.layers.10.self_attn.k_proj.weight shape: [2560, 2560]
+43: model.layers.10.self_attn.q_proj.bias shape: [2560]
+44: model.layers.10.self_attn.q_proj.weight shape: [2560, 2560]
+45: model.layers.10.self_attn.v_proj.bias shape: [2560]
+46: model.layers.10.self_attn.v_proj.weight shape: [2560, 2560]
+47: model.layers.11.input_layernorm.bias shape: [2560]
+48: model.layers.11.input_layernorm.weight shape: [2560]
+49: model.layers.11.mlp.fc1.bias shape: [10240]
+50: model.layers.11.mlp.fc1.weight shape: [10240, 2560]
+51: model.layers.11.mlp.fc2.bias shape: [2560]
+52: model.layers.11.mlp.fc2.weight shape: [2560, 10240]
+53: model.layers.11.self_attn.dense.bias shape: [2560]
+54: model.layers.11.self_attn.dense.weight shape: [2560, 2560]
+55: model.layers.11.self_attn.k_proj.bias shape: [2560]
+56: model.layers.11.self_attn.k_proj.weight shape: [2560, 2560]
+57: model.layers.11.self_attn.q_proj.bias shape: [2560]
+58: model.layers.11.self_attn.q_proj.weight shape: [2560, 2560]
+59: model.layers.11.self_attn.v_proj.bias shape: [2560]
+60: model.layers.11.self_attn.v_proj.weight shape: [2560, 2560]
+61: model.layers.12.input_layernorm.bias shape: [2560]
+62: model.layers.12.input_layernorm.weight shape: [2560]
+63: model.layers.12.mlp.fc1.bias shape: [10240]
+64: model.layers.12.mlp.fc1.weight shape: [10240, 2560]
+65: model.layers.12.mlp.fc2.bias shape: [2560]
+66: model.layers.12.mlp.fc2.weight shape: [2560, 10240]
+67: model.layers.12.self_attn.dense.bias shape: [2560]
+68: model.layers.12.self_attn.dense.weight shape: [2560, 2560]
+69: model.layers.12.self_attn.k_proj.bias shape: [2560]
+70: model.layers.12.self_attn.k_proj.weight shape: [2560, 2560]
+71: model.layers.12.self_attn.q_proj.bias shape: [2560]
+72: model.layers.12.self_attn.q_proj.weight shape: [2560, 2560]
+73: model.layers.12.self_attn.v_proj.bias shape: [2560]
+74: model.layers.12.self_attn.v_proj.weight shape: [2560, 2560]
+75: model.layers.13.input_layernorm.bias shape: [2560]
+76: model.layers.13.input_layernorm.weight shape: [2560]
+77: model.layers.13.mlp.fc1.bias shape: [10240]
+78: model.layers.13.mlp.fc1.weight shape: [10240, 2560]
+79: model.layers.13.mlp.fc2.bias shape: [2560]
+80: model.layers.13.mlp.fc2.weight shape: [2560, 10240]
+81: model.layers.13.self_attn.dense.bias shape: [2560]
+82: model.layers.13.self_attn.dense.weight shape: [2560, 2560]
+83: model.layers.13.self_attn.k_proj.bias shape: [2560]
+84: model.layers.13.self_attn.k_proj.weight shape: [2560, 2560]
+85: model.layers.13.self_attn.q_proj.bias shape: [2560]
+86: model.layers.13.self_attn.q_proj.weight shape: [2560, 2560]
+87: model.layers.13.self_attn.v_proj.bias shape: [2560]
+88: model.layers.13.self_attn.v_proj.weight shape: [2560, 2560]
+89: model.layers.14.input_layernorm.bias shape: [2560]
+90: model.layers.14.input_layernorm.weight shape: [2560]
+91: model.layers.14.mlp.fc1.bias shape: [10240]
+92: model.layers.14.mlp.fc1.weight shape: [10240, 2560]
+93: model.layers.14.mlp.fc2.bias shape: [2560]
+94: model.layers.14.mlp.fc2.weight shape: [2560, 10240]
+95: model.layers.14.self_attn.dense.bias shape: [2560]
+96: model.layers.14.self_attn.dense.weight shape: [2560, 2560]
+97: model.layers.14.self_attn.k_proj.bias shape: [2560]
+98: model.layers.14.self_attn.k_proj.weight shape: [2560, 2560]
+99: model.layers.14.self_attn.q_proj.bias shape: [2560]
+100: model.layers.14.self_attn.q_proj.weight shape: [2560, 2560]
+101: model.layers.14.self_attn.v_proj.bias shape: [2560]
+102: model.layers.14.self_attn.v_proj.weight shape: [2560, 2560]
+103: model.layers.15.input_layernorm.bias shape: [2560]
+104: model.layers.15.input_layernorm.weight shape: [2560]
+105: model.layers.15.mlp.fc1.bias shape: [10240]
+106: model.layers.15.mlp.fc1.weight shape: [10240, 2560]
+107: model.layers.15.mlp.fc2.bias shape: [2560]
+108: model.layers.15.mlp.fc2.weight shape: [2560, 10240]
+109: model.layers.15.self_attn.dense.bias shape: [2560]
+110: model.layers.15.self_attn.dense.weight shape: [2560, 2560]
+111: model.layers.15.self_attn.k_proj.bias shape: [2560]
+112: model.layers.15.self_attn.k_proj.weight shape: [2560, 2560]
+113: model.layers.15.self_attn.q_proj.bias shape: [2560]
+114: model.layers.15.self_attn.q_proj.weight shape: [2560, 2560]
+115: model.layers.15.self_attn.v_proj.bias shape: [2560]
+116: model.layers.15.self_attn.v_proj.weight shape: [2560, 2560]
+117: model.layers.16.input_layernorm.bias shape: [2560]
+118: model.layers.16.input_layernorm.weight shape: [2560]
+119: model.layers.16.mlp.fc1.bias shape: [10240]
+120: model.layers.16.mlp.fc1.weight shape: [10240, 2560]
+121: model.layers.16.mlp.fc2.bias shape: [2560]
+122: model.layers.16.mlp.fc2.weight shape: [2560, 10240]
+123: model.layers.16.self_attn.dense.bias shape: [2560]
+124: model.layers.16.self_attn.dense.weight shape: [2560, 2560]
+125: model.layers.16.self_attn.k_proj.bias shape: [2560]
+126: model.layers.16.self_attn.k_proj.weight shape: [2560, 2560]
+127: model.layers.16.self_attn.q_proj.bias shape: [2560]
+128: model.layers.16.self_attn.q_proj.weight shape: [2560, 2560]
+129: model.layers.16.self_attn.v_proj.bias shape: [2560]
+130: model.layers.16.self_attn.v_proj.weight shape: [2560, 2560]
+131: model.layers.17.input_layernorm.bias shape: [2560]
+132: model.layers.17.input_layernorm.weight shape: [2560]
+133: model.layers.17.mlp.fc1.bias shape: [10240]
+134: model.layers.17.mlp.fc1.weight shape: [10240, 2560]
+135: model.layers.17.mlp.fc2.bias shape: [2560]
+136: model.layers.17.mlp.fc2.weight shape: [2560, 10240]
+137: model.layers.17.self_attn.dense.bias shape: [2560]
+138: model.layers.17.self_attn.dense.weight shape: [2560, 2560]
+139: model.layers.17.self_attn.k_proj.bias shape: [2560]
+140: model.layers.17.self_attn.k_proj.weight shape: [2560, 2560]
+141: model.layers.17.self_attn.q_proj.bias shape: [2560]
+142: model.layers.17.self_attn.q_proj.weight shape: [2560, 2560]
+143: model.layers.17.self_attn.v_proj.bias shape: [2560]
+144: model.layers.17.self_attn.v_proj.weight shape: [2560, 2560]
+145: model.layers.18.input_layernorm.bias shape: [2560]
+146: model.layers.18.input_layernorm.weight shape: [2560]
+147: model.layers.18.mlp.fc1.bias shape: [10240]
+148: model.layers.18.mlp.fc1.weight shape: [10240, 2560]
+149: model.layers.18.mlp.fc2.bias shape: [2560]
+150: model.layers.18.mlp.fc2.weight shape: [2560, 10240]
+151: model.layers.18.self_attn.dense.bias shape: [2560]
+152: model.layers.18.self_attn.dense.weight shape: [2560, 2560]
+153: model.layers.18.self_attn.k_proj.bias shape: [2560]
+154: model.layers.18.self_attn.k_proj.weight shape: [2560, 2560]
+155: model.layers.18.self_attn.q_proj.bias shape: [2560]
+156: model.layers.18.self_attn.q_proj.weight shape: [2560, 2560]
+157: model.layers.18.self_attn.v_proj.bias shape: [2560]
+158: model.layers.18.self_attn.v_proj.weight shape: [2560, 2560]
+159: model.layers.19.input_layernorm.bias shape: [2560]
+160: model.layers.19.input_layernorm.weight shape: [2560]
+161: model.layers.19.mlp.fc1.bias shape: [10240]
+162: model.layers.19.mlp.fc1.weight shape: [10240, 2560]
+163: model.layers.19.mlp.fc2.bias shape: [2560]
+164: model.layers.19.mlp.fc2.weight shape: [2560, 10240]
+165: model.layers.19.self_attn.dense.bias shape: [2560]
+166: model.layers.19.self_attn.dense.weight shape: [2560, 2560]
+167: model.layers.19.self_attn.k_proj.bias shape: [2560]
+168: model.layers.19.self_attn.k_proj.weight shape: [2560, 2560]
+169: model.layers.19.self_attn.q_proj.bias shape: [2560]
+170: model.layers.19.self_attn.q_proj.weight shape: [2560, 2560]
+171: model.layers.19.self_attn.v_proj.bias shape: [2560]
+172: model.layers.19.self_attn.v_proj.weight shape: [2560, 2560]
+173: model.layers.2.input_layernorm.bias shape: [2560]
+174: model.layers.2.input_layernorm.weight shape: [2560]
+175: model.layers.2.mlp.fc1.bias shape: [10240]
+176: model.layers.2.mlp.fc1.weight shape: [10240, 2560]
+177: model.layers.2.mlp.fc2.bias shape: [2560]
+178: model.layers.2.mlp.fc2.weight shape: [2560, 10240]
+179: model.layers.2.self_attn.dense.bias shape: [2560]
+180: model.layers.2.self_attn.dense.weight shape: [2560, 2560]
+181: model.layers.2.self_attn.k_proj.bias shape: [2560]
+182: model.layers.2.self_attn.k_proj.weight shape: [2560, 2560]
+183: model.layers.2.self_attn.q_proj.bias shape: [2560]
+184: model.layers.2.self_attn.q_proj.weight shape: [2560, 2560]
+185: model.layers.2.self_attn.v_proj.bias shape: [2560]
+186: model.layers.2.self_attn.v_proj.weight shape: [2560, 2560]
+187: model.layers.20.input_layernorm.bias shape: [2560]
+188: model.layers.20.input_layernorm.weight shape: [2560]
+189: model.layers.20.mlp.fc1.bias shape: [10240]
+190: model.layers.20.mlp.fc1.weight shape: [10240, 2560]
+191: model.layers.20.mlp.fc2.bias shape: [2560]
+192: model.layers.20.mlp.fc2.weight shape: [2560, 10240]
+193: model.layers.20.self_attn.dense.bias shape: [2560]
+194: model.layers.20.self_attn.dense.weight shape: [2560, 2560]
+195: model.layers.20.self_attn.k_proj.bias shape: [2560]
+196: model.layers.20.self_attn.k_proj.weight shape: [2560, 2560]
+197: model.layers.20.self_attn.q_proj.bias shape: [2560]
+198: model.layers.20.self_attn.q_proj.weight shape: [2560, 2560]
+199: model.layers.20.self_attn.v_proj.bias shape: [2560]
+200: model.layers.20.self_attn.v_proj.weight shape: [2560, 2560]
+201: model.layers.21.input_layernorm.bias shape: [2560]
+202: model.layers.21.input_layernorm.weight shape: [2560]
+203: model.layers.21.mlp.fc1.bias shape: [10240]
+204: model.layers.21.mlp.fc1.weight shape: [10240, 2560]
+205: model.layers.21.mlp.fc2.bias shape: [2560]
+206: model.layers.21.mlp.fc2.weight shape: [2560, 10240]
+207: model.layers.21.self_attn.dense.bias shape: [2560]
+208: model.layers.21.self_attn.dense.weight shape: [2560, 2560]
+209: model.layers.21.self_attn.k_proj.bias shape: [2560]
+210: model.layers.21.self_attn.k_proj.weight shape: [2560, 2560]
+211: model.layers.21.self_attn.q_proj.bias shape: [2560]
+212: model.layers.21.self_attn.q_proj.weight shape: [2560, 2560]
+213: model.layers.21.self_attn.v_proj.bias shape: [2560]
+214: model.layers.21.self_attn.v_proj.weight shape: [2560, 2560]
+215: model.layers.22.input_layernorm.bias shape: [2560]
+216: model.layers.22.input_layernorm.weight shape: [2560]
+217: model.layers.22.mlp.fc1.bias shape: [10240]
+218: model.layers.22.mlp.fc1.weight shape: [10240, 2560]
+219: model.layers.22.mlp.fc2.bias shape: [2560]
+220: model.layers.22.mlp.fc2.weight shape: [2560, 10240]
+221: model.layers.22.self_attn.dense.bias shape: [2560]
+222: model.layers.22.self_attn.dense.weight shape: [2560, 2560]
+223: model.layers.22.self_attn.k_proj.bias shape: [2560]
+224: model.layers.22.self_attn.k_proj.weight shape: [2560, 2560]
+225: model.layers.22.self_attn.q_proj.bias shape: [2560]
+226: model.layers.22.self_attn.q_proj.weight shape: [2560, 2560]
+227: model.layers.22.self_attn.v_proj.bias shape: [2560]
+228: model.layers.22.self_attn.v_proj.weight shape: [2560, 2560]
+229: model.layers.23.input_layernorm.bias shape: [2560]
+230: model.layers.23.input_layernorm.weight shape: [2560]
+231: model.layers.23.mlp.fc1.bias shape: [10240]
+232: model.layers.23.mlp.fc1.weight shape: [10240, 2560]
+233: model.layers.23.mlp.fc2.bias shape: [2560]
+234: model.layers.23.mlp.fc2.weight shape: [2560, 10240]
+235: model.layers.23.self_attn.dense.bias shape: [2560]
+236: model.layers.23.self_attn.dense.weight shape: [2560, 2560]
+237: model.layers.23.self_attn.k_proj.bias shape: [2560]
+238: model.layers.23.self_attn.k_proj.weight shape: [2560, 2560]
+239: model.layers.23.self_attn.q_proj.bias shape: [2560]
+240: model.layers.23.self_attn.q_proj.weight shape: [2560, 2560]
+241: model.layers.23.self_attn.v_proj.bias shape: [2560]
+242: model.layers.23.self_attn.v_proj.weight shape: [2560, 2560]
+243: model.layers.24.input_layernorm.bias shape: [2560]
+244: model.layers.24.input_layernorm.weight shape: [2560]
+245: model.layers.24.mlp.fc1.bias shape: [10240]
+246: model.layers.24.mlp.fc1.weight shape: [10240, 2560]
+247: model.layers.24.mlp.fc2.bias shape: [2560]
+248: model.layers.24.mlp.fc2.weight shape: [2560, 10240]
+249: model.layers.24.self_attn.dense.bias shape: [2560]
+250: model.layers.24.self_attn.dense.weight shape: [2560, 2560]
+251: model.layers.24.self_attn.k_proj.bias shape: [2560]
+252: model.layers.24.self_attn.k_proj.weight shape: [2560, 2560]
+253: model.layers.24.self_attn.q_proj.bias shape: [2560]
+254: model.layers.24.self_attn.q_proj.weight shape: [2560, 2560]
+255: model.layers.24.self_attn.v_proj.bias shape: [2560]
+256: model.layers.24.self_attn.v_proj.weight shape: [2560, 2560]
+257: model.layers.25.input_layernorm.bias shape: [2560]
+258: model.layers.25.input_layernorm.weight shape: [2560]
+259: model.layers.25.mlp.fc1.bias shape: [10240]
+260: model.layers.25.mlp.fc1.weight shape: [10240, 2560]
+261: model.layers.25.mlp.fc2.bias shape: [2560]
+262: model.layers.25.mlp.fc2.weight shape: [2560, 10240]
+263: model.layers.25.self_attn.dense.bias shape: [2560]
+264: model.layers.25.self_attn.dense.weight shape: [2560, 2560]
+265: model.layers.25.self_attn.k_proj.bias shape: [2560]
+266: model.layers.25.self_attn.k_proj.weight shape: [2560, 2560]
+267: model.layers.25.self_attn.q_proj.bias shape: [2560]
+268: model.layers.25.self_attn.q_proj.weight shape: [2560, 2560]
+269: model.layers.25.self_attn.v_proj.bias shape: [2560]
+270: model.layers.25.self_attn.v_proj.weight shape: [2560, 2560]
+271: model.layers.26.input_layernorm.bias shape: [2560]
+272: model.layers.26.input_layernorm.weight shape: [2560]
+273: model.layers.26.mlp.fc1.bias shape: [10240]
+274: model.layers.26.mlp.fc1.weight shape: [10240, 2560]
+275: model.layers.26.mlp.fc2.bias shape: [2560]
+276: model.layers.26.mlp.fc2.weight shape: [2560, 10240]
+277: model.layers.26.self_attn.dense.bias shape: [2560]
+278: model.layers.26.self_attn.dense.weight shape: [2560, 2560]
+279: model.layers.26.self_attn.k_proj.bias shape: [2560]
+280: model.layers.26.self_attn.k_proj.weight shape: [2560, 2560]
+281: model.layers.26.self_attn.q_proj.bias shape: [2560]
+282: model.layers.26.self_attn.q_proj.weight shape: [2560, 2560]
+283: model.layers.26.self_attn.v_proj.bias shape: [2560]
+284: model.layers.26.self_attn.v_proj.weight shape: [2560, 2560]
+285: model.layers.27.input_layernorm.bias shape: [2560]
+286: model.layers.27.input_layernorm.weight shape: [2560]
+287: model.layers.27.mlp.fc1.bias shape: [10240]
+288: model.layers.27.mlp.fc1.weight shape: [10240, 2560]
+289: model.layers.27.mlp.fc2.bias shape: [2560]
+290: model.layers.27.mlp.fc2.weight shape: [2560, 10240]
+291: model.layers.27.self_attn.dense.bias shape: [2560]
+292: model.layers.27.self_attn.dense.weight shape: [2560, 2560]
+293: model.layers.27.self_attn.k_proj.bias shape: [2560]
+294: model.layers.27.self_attn.k_proj.weight shape: [2560, 2560]
+295: model.layers.27.self_attn.q_proj.bias shape: [2560]
+296: model.layers.27.self_attn.q_proj.weight shape: [2560, 2560]
+297: model.layers.27.self_attn.v_proj.bias shape: [2560]
+298: model.layers.27.self_attn.v_proj.weight shape: [2560, 2560]
+299: model.layers.28.input_layernorm.bias shape: [2560]
+300: model.layers.28.input_layernorm.weight shape: [2560]
+301: model.layers.28.mlp.fc1.bias shape: [10240]
+302: model.layers.28.mlp.fc1.weight shape: [10240, 2560]
+303: model.layers.28.mlp.fc2.bias shape: [2560]
+304: model.layers.28.mlp.fc2.weight shape: [2560, 10240]
+305: model.layers.28.self_attn.dense.bias shape: [2560]
+306: model.layers.28.self_attn.dense.weight shape: [2560, 2560]
+307: model.layers.28.self_attn.k_proj.bias shape: [2560]
+308: model.layers.28.self_attn.k_proj.weight shape: [2560, 2560]
+309: model.layers.28.self_attn.q_proj.bias shape: [2560]
+310: model.layers.28.self_attn.q_proj.weight shape: [2560, 2560]
+311: model.layers.28.self_attn.v_proj.bias shape: [2560]
+312: model.layers.28.self_attn.v_proj.weight shape: [2560, 2560]
+313: model.layers.29.input_layernorm.bias shape: [2560]
+314: model.layers.29.input_layernorm.weight shape: [2560]
+315: model.layers.29.mlp.fc1.bias shape: [10240]
+316: model.layers.29.mlp.fc1.weight shape: [10240, 2560]
+317: model.layers.29.mlp.fc2.bias shape: [2560]
+318: model.layers.29.mlp.fc2.weight shape: [2560, 10240]
+319: model.layers.29.self_attn.dense.bias shape: [2560]
+320: model.layers.29.self_attn.dense.weight shape: [2560, 2560]
+321: model.layers.29.self_attn.k_proj.bias shape: [2560]
+322: model.layers.29.self_attn.k_proj.weight shape: [2560, 2560]
+323: model.layers.29.self_attn.q_proj.bias shape: [2560]
+324: model.layers.29.self_attn.q_proj.weight shape: [2560, 2560]
+325: model.layers.29.self_attn.v_proj.bias shape: [2560]
+326: model.layers.29.self_attn.v_proj.weight shape: [2560, 2560]
+327: model.layers.3.input_layernorm.bias shape: [2560]
+328: model.layers.3.input_layernorm.weight shape: [2560]
+329: model.layers.3.mlp.fc1.bias shape: [10240]
+330: model.layers.3.mlp.fc1.weight shape: [10240, 2560]
+331: model.layers.3.mlp.fc2.bias shape: [2560]
+332: model.layers.3.mlp.fc2.weight shape: [2560, 10240]
+333: model.layers.3.self_attn.dense.bias shape: [2560]
+334: model.layers.3.self_attn.dense.weight shape: [2560, 2560]
+335: model.layers.3.self_attn.k_proj.bias shape: [2560]
+336: model.layers.3.self_attn.k_proj.weight shape: [2560, 2560]
+337: model.layers.3.self_attn.q_proj.bias shape: [2560]
+338: model.layers.3.self_attn.q_proj.weight shape: [2560, 2560]
+339: model.layers.3.self_attn.v_proj.bias shape: [2560]
+340: model.layers.3.self_attn.v_proj.weight shape: [2560, 2560]
+341: model.layers.30.input_layernorm.bias shape: [2560]
+342: model.layers.30.input_layernorm.weight shape: [2560]
+343: model.layers.30.mlp.fc1.bias shape: [10240]
+344: model.layers.30.mlp.fc1.weight shape: [10240, 2560]
+345: model.layers.30.mlp.fc2.bias shape: [2560]
+346: model.layers.30.mlp.fc2.weight shape: [2560, 10240]
+347: model.layers.30.self_attn.dense.bias shape: [2560]
+348: model.layers.30.self_attn.dense.weight shape: [2560, 2560]
+349: model.layers.30.self_attn.k_proj.bias shape: [2560]
+350: model.layers.30.self_attn.k_proj.weight shape: [2560, 2560]
+351: model.layers.30.self_attn.q_proj.bias shape: [2560]
+352: model.layers.30.self_attn.q_proj.weight shape: [2560, 2560]
+353: model.layers.30.self_attn.v_proj.bias shape: [2560]
+354: model.layers.30.self_attn.v_proj.weight shape: [2560, 2560]
+355: model.layers.31.input_layernorm.bias shape: [2560]
+356: model.layers.31.input_layernorm.weight shape: [2560]
+357: model.layers.31.mlp.fc1.bias shape: [10240]
+358: model.layers.31.mlp.fc1.weight shape: [10240, 2560]
+359: model.layers.31.mlp.fc2.bias shape: [2560]
+360: model.layers.31.mlp.fc2.weight shape: [2560, 10240]
+361: model.layers.31.self_attn.dense.bias shape: [2560]
+362: model.layers.31.self_attn.dense.weight shape: [2560, 2560]
+363: model.layers.31.self_attn.k_proj.bias shape: [2560]
+364: model.layers.31.self_attn.k_proj.weight shape: [2560, 2560]
+365: model.layers.31.self_attn.q_proj.bias shape: [2560]
+366: model.layers.31.self_attn.q_proj.weight shape: [2560, 2560]
+367: model.layers.31.self_attn.v_proj.bias shape: [2560]
+368: model.layers.31.self_attn.v_proj.weight shape: [2560, 2560]
+369: model.layers.4.input_layernorm.bias shape: [2560]
+370: model.layers.4.input_layernorm.weight shape: [2560]
+371: model.layers.4.mlp.fc1.bias shape: [10240]
+372: model.layers.4.mlp.fc1.weight shape: [10240, 2560]
+373: model.layers.4.mlp.fc2.bias shape: [2560]
+374: model.layers.4.mlp.fc2.weight shape: [2560, 10240]
+375: model.layers.4.self_attn.dense.bias shape: [2560]
+376: model.layers.4.self_attn.dense.weight shape: [2560, 2560]
+377: model.layers.4.self_attn.k_proj.bias shape: [2560]
+378: model.layers.4.self_attn.k_proj.weight shape: [2560, 2560]
+379: model.layers.4.self_attn.q_proj.bias shape: [2560]
+380: model.layers.4.self_attn.q_proj.weight shape: [2560, 2560]
+381: model.layers.4.self_attn.v_proj.bias shape: [2560]
+382: model.layers.4.self_attn.v_proj.weight shape: [2560, 2560]
+383: model.layers.5.input_layernorm.bias shape: [2560]
+384: model.layers.5.input_layernorm.weight shape: [2560]
+385: model.layers.5.mlp.fc1.bias shape: [10240]
+386: model.layers.5.mlp.fc1.weight shape: [10240, 2560]
+387: model.layers.5.mlp.fc2.bias shape: [2560]
+388: model.layers.5.mlp.fc2.weight shape: [2560, 10240]
+389: model.layers.5.self_attn.dense.bias shape: [2560]
+390: model.layers.5.self_attn.dense.weight shape: [2560, 2560]
+391: model.layers.5.self_attn.k_proj.bias shape: [2560]
+392: model.layers.5.self_attn.k_proj.weight shape: [2560, 2560]
+393: model.layers.5.self_attn.q_proj.bias shape: [2560]
+394: model.layers.5.self_attn.q_proj.weight shape: [2560, 2560]
+395: model.layers.5.self_attn.v_proj.bias shape: [2560]
+396: model.layers.5.self_attn.v_proj.weight shape: [2560, 2560]
+397: model.layers.6.input_layernorm.bias shape: [2560]
+398: model.layers.6.input_layernorm.weight shape: [2560]
+399: model.layers.6.mlp.fc1.bias shape: [10240]
+400: model.layers.6.mlp.fc1.weight shape: [10240, 2560]
+401: model.layers.6.mlp.fc2.bias shape: [2560]
+402: model.layers.6.mlp.fc2.weight shape: [2560, 10240]
+403: model.layers.6.self_attn.dense.bias shape: [2560]
+404: model.layers.6.self_attn.dense.weight shape: [2560, 2560]
+405: model.layers.6.self_attn.k_proj.bias shape: [2560]
+406: model.layers.6.self_attn.k_proj.weight shape: [2560, 2560]
+407: model.layers.6.self_attn.q_proj.bias shape: [2560]
+408: model.layers.6.self_attn.q_proj.weight shape: [2560, 2560]
+409: model.layers.6.self_attn.v_proj.bias shape: [2560]
+410: model.layers.6.self_attn.v_proj.weight shape: [2560, 2560]
+411: model.layers.7.input_layernorm.bias shape: [2560]
+412: model.layers.7.input_layernorm.weight shape: [2560]
+413: model.layers.7.mlp.fc1.bias shape: [10240]
+414: model.layers.7.mlp.fc1.weight shape: [10240, 2560]
+415: model.layers.7.mlp.fc2.bias shape: [2560]
+416: model.layers.7.mlp.fc2.weight shape: [2560, 10240]
+417: model.layers.7.self_attn.dense.bias shape: [2560]
+418: model.layers.7.self_attn.dense.weight shape: [2560, 2560]
+419: model.layers.7.self_attn.k_proj.bias shape: [2560]
+420: model.layers.7.self_attn.k_proj.weight shape: [2560, 2560]
+421: model.layers.7.self_attn.q_proj.bias shape: [2560]
+422: model.layers.7.self_attn.q_proj.weight shape: [2560, 2560]
+423: model.layers.7.self_attn.v_proj.bias shape: [2560]
+424: model.layers.7.self_attn.v_proj.weight shape: [2560, 2560]
+425: model.layers.8.input_layernorm.bias shape: [2560]
+426: model.layers.8.input_layernorm.weight shape: [2560]
+427: model.layers.8.mlp.fc1.bias shape: [10240]
+428: model.layers.8.mlp.fc1.weight shape: [10240, 2560]
+429: model.layers.8.mlp.fc2.bias shape: [2560]
+430: model.layers.8.mlp.fc2.weight shape: [2560, 10240]
+431: model.layers.8.self_attn.dense.bias shape: [2560]
+432: model.layers.8.self_attn.dense.weight shape: [2560, 2560]
+433: model.layers.8.self_attn.k_proj.bias shape: [2560]
+434: model.layers.8.self_attn.k_proj.weight shape: [2560, 2560]
+435: model.layers.8.self_attn.q_proj.bias shape: [2560]
+436: model.layers.8.self_attn.q_proj.weight shape: [2560, 2560]
+437: model.layers.8.self_attn.v_proj.bias shape: [2560]
+438: model.layers.8.self_attn.v_proj.weight shape: [2560, 2560]
+439: model.layers.9.input_layernorm.bias shape: [2560]
+440: model.layers.9.input_layernorm.weight shape: [2560]
+441: model.layers.9.mlp.fc1.bias shape: [10240]
+442: model.layers.9.mlp.fc1.weight shape: [10240, 2560]
+443: model.layers.9.mlp.fc2.bias shape: [2560]
+444: model.layers.9.mlp.fc2.weight shape: [2560, 10240]
+445: model.layers.9.self_attn.dense.bias shape: [2560]
+446: model.layers.9.self_attn.dense.weight shape: [2560, 2560]
+447: model.layers.9.self_attn.k_proj.bias shape: [2560]
+448: model.layers.9.self_attn.k_proj.weight shape: [2560, 2560]
+449: model.layers.9.self_attn.q_proj.bias shape: [2560]
+450: model.layers.9.self_attn.q_proj.weight shape: [2560, 2560]
+451: model.layers.9.self_attn.v_proj.bias shape: [2560]
+452: model.layers.9.self_attn.v_proj.weight shape: [2560, 2560]
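
This 453-line dump is fully determined by a handful of Phi-2 config values: 32 decoder layers, each contributing 14 tensors (weight/bias pairs for input_layernorm, fc1, fc2, dense, q_proj, k_proj and v_proj — a single layernorm per layer, consistent with Phi-2's parallel attention/MLP block), plus the embedding, final layernorm, and a biased lm_head. A quick back-of-envelope check, with the config values treated as assumptions:

```csharp
// Back-of-envelope check of the Phi-2 dump above; the config values are
// assumptions (they match the published Phi-2 config and the shapes shown).
const int hidden = 2560, layers = 32, heads = 32;

int headDim = hidden / heads;            // 80 -- plain MHA, no grouped KV,
                                         // hence the square [2560, 2560] projections
int fc1Rows = 4 * hidden;                // 10240, the usual 4x MLP expansion
int tensorsPerLayer = 14;                // 7 weight/bias pairs per decoder layer
int total = layers * tensorsPerLayer     // 448
          + 2                            // lm_head.{weight,bias} over the 51200 vocab
          + 1                            // model.embed_tokens.weight
          + 2;                           // model.final_layernorm.{weight,bias}

System.Console.WriteLine((headDim, fc1Rows, total)); // (80, 10240, 453)
```
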
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.TokenizerTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.TokenizerTest.approved.txt
new file mode 100644
index 0000000000..7338548917
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi2Tests.TokenizerTest.approved.txt
@@ -0,0 +1,3 @@
+50256, 6090, 345, 2148, 2842, 284, 4483, 17790, 286, 35484, 290, 10441, 69, 50187, 30
+50256, 19457, 0, 3423, 389, 617, 2842, 284, 4483, 35484, 290, 10441, 69, 50187, 1978, 25, 352, 13, 40058, 290, 10441, 34711, 7209, 494, 25, 41198, 35484, 290, 10441, 69, 50187, 1978, 351, 617, 7545, 290, 12498, 13, 362, 13, 40058, 290, 10441, 34711, 20698, 25, 15561, 26790, 35484, 290, 10441, 69, 50187, 1978, 351, 617, 18873, 13135, 290, 12498, 13
+50256, 2061, 546, 18120, 281, 362, 87, 1343, 513, 796, 767, 16022, 30
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium128KShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium128KShapeTest.approved.txt
new file mode 100644
index 0000000000..277f686aa7
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium128KShapeTest.approved.txt
@@ -0,0 +1,243 @@
+0: lm_head.weight shape: [32064, 5120]
+1: model.embed_tokens.weight shape: [32064, 5120]
+2: model.layers.0.input_layernorm.weight shape: [5120]
+3: model.layers.0.mlp.down_proj.weight shape: [5120, 17920]
+4: model.layers.0.mlp.gate_up_proj.weight shape: [35840, 5120]
+5: model.layers.0.post_attention_layernorm.weight shape: [5120]
+6: model.layers.0.self_attn.o_proj.weight shape: [5120, 5120]
+7: model.layers.0.self_attn.qkv_proj.weight shape: [7680, 5120]
+8: model.layers.1.input_layernorm.weight shape: [5120]
+9: model.layers.1.mlp.down_proj.weight shape: [5120, 17920]
+10: model.layers.1.mlp.gate_up_proj.weight shape: [35840, 5120]
+11: model.layers.1.post_attention_layernorm.weight shape: [5120]
+12: model.layers.1.self_attn.o_proj.weight shape: [5120, 5120]
+13: model.layers.1.self_attn.qkv_proj.weight shape: [7680, 5120]
+14: model.layers.10.input_layernorm.weight shape: [5120]
+15: model.layers.10.mlp.down_proj.weight shape: [5120, 17920]
+16: model.layers.10.mlp.gate_up_proj.weight shape: [35840, 5120]
+17: model.layers.10.post_attention_layernorm.weight shape: [5120]
+18: model.layers.10.self_attn.o_proj.weight shape: [5120, 5120]
+19: model.layers.10.self_attn.qkv_proj.weight shape: [7680, 5120]
+20: model.layers.11.input_layernorm.weight shape: [5120]
+21: model.layers.11.mlp.down_proj.weight shape: [5120, 17920]
+22: model.layers.11.mlp.gate_up_proj.weight shape: [35840, 5120]
+23: model.layers.11.post_attention_layernorm.weight shape: [5120]
+24: model.layers.11.self_attn.o_proj.weight shape: [5120, 5120]
+25: model.layers.11.self_attn.qkv_proj.weight shape: [7680, 5120]
+26: model.layers.12.input_layernorm.weight shape: [5120]
+27: model.layers.12.mlp.down_proj.weight shape: [5120, 17920]
+28: model.layers.12.mlp.gate_up_proj.weight shape: [35840, 5120]
+29: model.layers.12.post_attention_layernorm.weight shape: [5120]
+30: model.layers.12.self_attn.o_proj.weight shape: [5120, 5120]
+31: model.layers.12.self_attn.qkv_proj.weight shape: [7680, 5120]
+32: model.layers.13.input_layernorm.weight shape: [5120]
+33: model.layers.13.mlp.down_proj.weight shape: [5120, 17920]
+34: model.layers.13.mlp.gate_up_proj.weight shape: [35840, 5120]
+35: model.layers.13.post_attention_layernorm.weight shape: [5120]
+36: model.layers.13.self_attn.o_proj.weight shape: [5120, 5120]
+37: model.layers.13.self_attn.qkv_proj.weight shape: [7680, 5120]
+38: model.layers.14.input_layernorm.weight shape: [5120]
+39: model.layers.14.mlp.down_proj.weight shape: [5120, 17920]
+40: model.layers.14.mlp.gate_up_proj.weight shape: [35840, 5120]
+41: model.layers.14.post_attention_layernorm.weight shape: [5120]
+42: model.layers.14.self_attn.o_proj.weight shape: [5120, 5120]
+43: model.layers.14.self_attn.qkv_proj.weight shape: [7680, 5120]
+44: model.layers.15.input_layernorm.weight shape: [5120]
+45: model.layers.15.mlp.down_proj.weight shape: [5120, 17920]
+46: model.layers.15.mlp.gate_up_proj.weight shape: [35840, 5120]
+47: model.layers.15.post_attention_layernorm.weight shape: [5120]
+48: model.layers.15.self_attn.o_proj.weight shape: [5120, 5120]
+49: model.layers.15.self_attn.qkv_proj.weight shape: [7680, 5120]
+50: model.layers.16.input_layernorm.weight shape: [5120]
+51: model.layers.16.mlp.down_proj.weight shape: [5120, 17920]
+52: model.layers.16.mlp.gate_up_proj.weight shape: [35840, 5120]
+53: model.layers.16.post_attention_layernorm.weight shape: [5120]
+54: model.layers.16.self_attn.o_proj.weight shape: [5120, 5120]
+55: model.layers.16.self_attn.qkv_proj.weight shape: [7680, 5120]
+56: model.layers.17.input_layernorm.weight shape: [5120]
+57: model.layers.17.mlp.down_proj.weight shape: [5120, 17920]
+58: model.layers.17.mlp.gate_up_proj.weight shape: [35840, 5120]
+59: model.layers.17.post_attention_layernorm.weight shape: [5120]
+60: model.layers.17.self_attn.o_proj.weight shape: [5120, 5120]
+61: model.layers.17.self_attn.qkv_proj.weight shape: [7680, 5120]
+62: model.layers.18.input_layernorm.weight shape: [5120]
+63: model.layers.18.mlp.down_proj.weight shape: [5120, 17920]
+64: model.layers.18.mlp.gate_up_proj.weight shape: [35840, 5120]
+65: model.layers.18.post_attention_layernorm.weight shape: [5120]
+66: model.layers.18.self_attn.o_proj.weight shape: [5120, 5120]
+67: model.layers.18.self_attn.qkv_proj.weight shape: [7680, 5120]
+68: model.layers.19.input_layernorm.weight shape: [5120]
+69: model.layers.19.mlp.down_proj.weight shape: [5120, 17920]
+70: model.layers.19.mlp.gate_up_proj.weight shape: [35840, 5120]
+71: model.layers.19.post_attention_layernorm.weight shape: [5120]
+72: model.layers.19.self_attn.o_proj.weight shape: [5120, 5120]
+73: model.layers.19.self_attn.qkv_proj.weight shape: [7680, 5120]
+74: model.layers.2.input_layernorm.weight shape: [5120]
+75: model.layers.2.mlp.down_proj.weight shape: [5120, 17920]
+76: model.layers.2.mlp.gate_up_proj.weight shape: [35840, 5120]
+77: model.layers.2.post_attention_layernorm.weight shape: [5120]
+78: model.layers.2.self_attn.o_proj.weight shape: [5120, 5120]
+79: model.layers.2.self_attn.qkv_proj.weight shape: [7680, 5120]
+80: model.layers.20.input_layernorm.weight shape: [5120]
+81: model.layers.20.mlp.down_proj.weight shape: [5120, 17920]
+82: model.layers.20.mlp.gate_up_proj.weight shape: [35840, 5120]
+83: model.layers.20.post_attention_layernorm.weight shape: [5120]
+84: model.layers.20.self_attn.o_proj.weight shape: [5120, 5120]
+85: model.layers.20.self_attn.qkv_proj.weight shape: [7680, 5120]
+86: model.layers.21.input_layernorm.weight shape: [5120]
+87: model.layers.21.mlp.down_proj.weight shape: [5120, 17920]
+88: model.layers.21.mlp.gate_up_proj.weight shape: [35840, 5120]
+89: model.layers.21.post_attention_layernorm.weight shape: [5120]
+90: model.layers.21.self_attn.o_proj.weight shape: [5120, 5120]
+91: model.layers.21.self_attn.qkv_proj.weight shape: [7680, 5120]
+92: model.layers.22.input_layernorm.weight shape: [5120]
+93: model.layers.22.mlp.down_proj.weight shape: [5120, 17920]
+94: model.layers.22.mlp.gate_up_proj.weight shape: [35840, 5120]
+95: model.layers.22.post_attention_layernorm.weight shape: [5120]
+96: model.layers.22.self_attn.o_proj.weight shape: [5120, 5120]
+97: model.layers.22.self_attn.qkv_proj.weight shape: [7680, 5120]
+98: model.layers.23.input_layernorm.weight shape: [5120]
+99: model.layers.23.mlp.down_proj.weight shape: [5120, 17920]
+100: model.layers.23.mlp.gate_up_proj.weight shape: [35840, 5120]
+101: model.layers.23.post_attention_layernorm.weight shape: [5120]
+102: model.layers.23.self_attn.o_proj.weight shape: [5120, 5120]
+103: model.layers.23.self_attn.qkv_proj.weight shape: [7680, 5120]
+104: model.layers.24.input_layernorm.weight shape: [5120]
+105: model.layers.24.mlp.down_proj.weight shape: [5120, 17920]
+106: model.layers.24.mlp.gate_up_proj.weight shape: [35840, 5120]
+107: model.layers.24.post_attention_layernorm.weight shape: [5120]
+108: model.layers.24.self_attn.o_proj.weight shape: [5120, 5120]
+109: model.layers.24.self_attn.qkv_proj.weight shape: [7680, 5120]
+110: model.layers.25.input_layernorm.weight shape: [5120]
+111: model.layers.25.mlp.down_proj.weight shape: [5120, 17920]
+112: model.layers.25.mlp.gate_up_proj.weight shape: [35840, 5120]
+113: model.layers.25.post_attention_layernorm.weight shape: [5120]
+114: model.layers.25.self_attn.o_proj.weight shape: [5120, 5120]
+115: model.layers.25.self_attn.qkv_proj.weight shape: [7680, 5120]
+116: model.layers.26.input_layernorm.weight shape: [5120]
+117: model.layers.26.mlp.down_proj.weight shape: [5120, 17920]
+118: model.layers.26.mlp.gate_up_proj.weight shape: [35840, 5120]
+119: model.layers.26.post_attention_layernorm.weight shape: [5120]
+120: model.layers.26.self_attn.o_proj.weight shape: [5120, 5120]
+121: model.layers.26.self_attn.qkv_proj.weight shape: [7680, 5120]
+122: model.layers.27.input_layernorm.weight shape: [5120]
+123: model.layers.27.mlp.down_proj.weight shape: [5120, 17920]
+124: model.layers.27.mlp.gate_up_proj.weight shape: [35840, 5120]
+125: model.layers.27.post_attention_layernorm.weight shape: [5120]
+126: model.layers.27.self_attn.o_proj.weight shape: [5120, 5120]
+127: model.layers.27.self_attn.qkv_proj.weight shape: [7680, 5120]
+128: model.layers.28.input_layernorm.weight shape: [5120]
+129: model.layers.28.mlp.down_proj.weight shape: [5120, 17920]
+130: model.layers.28.mlp.gate_up_proj.weight shape: [35840, 5120]
+131: model.layers.28.post_attention_layernorm.weight shape: [5120]
+132: model.layers.28.self_attn.o_proj.weight shape: [5120, 5120]
+133: model.layers.28.self_attn.qkv_proj.weight shape: [7680, 5120]
+134: model.layers.29.input_layernorm.weight shape: [5120]
+135: model.layers.29.mlp.down_proj.weight shape: [5120, 17920]
+136: model.layers.29.mlp.gate_up_proj.weight shape: [35840, 5120]
+137: model.layers.29.post_attention_layernorm.weight shape: [5120]
+138: model.layers.29.self_attn.o_proj.weight shape: [5120, 5120]
+139: model.layers.29.self_attn.qkv_proj.weight shape: [7680, 5120]
+140: model.layers.3.input_layernorm.weight shape: [5120]
+141: model.layers.3.mlp.down_proj.weight shape: [5120, 17920]
+142: model.layers.3.mlp.gate_up_proj.weight shape: [35840, 5120]
+143: model.layers.3.post_attention_layernorm.weight shape: [5120]
+144: model.layers.3.self_attn.o_proj.weight shape: [5120, 5120]
+145: model.layers.3.self_attn.qkv_proj.weight shape: [7680, 5120]
+146: model.layers.30.input_layernorm.weight shape: [5120]
+147: model.layers.30.mlp.down_proj.weight shape: [5120, 17920]
+148: model.layers.30.mlp.gate_up_proj.weight shape: [35840, 5120]
+149: model.layers.30.post_attention_layernorm.weight shape: [5120]
+150: model.layers.30.self_attn.o_proj.weight shape: [5120, 5120]
+151: model.layers.30.self_attn.qkv_proj.weight shape: [7680, 5120]
+152: model.layers.31.input_layernorm.weight shape: [5120]
+153: model.layers.31.mlp.down_proj.weight shape: [5120, 17920]
+154: model.layers.31.mlp.gate_up_proj.weight shape: [35840, 5120]
+155: model.layers.31.post_attention_layernorm.weight shape: [5120]
+156: model.layers.31.self_attn.o_proj.weight shape: [5120, 5120]
+157: model.layers.31.self_attn.qkv_proj.weight shape: [7680, 5120]
+158: model.layers.32.input_layernorm.weight shape: [5120]
+159: model.layers.32.mlp.down_proj.weight shape: [5120, 17920]
+160: model.layers.32.mlp.gate_up_proj.weight shape: [35840, 5120]
+161: model.layers.32.post_attention_layernorm.weight shape: [5120]
+162: model.layers.32.self_attn.o_proj.weight shape: [5120, 5120]
+163: model.layers.32.self_attn.qkv_proj.weight shape: [7680, 5120]
+164: model.layers.33.input_layernorm.weight shape: [5120]
+165: model.layers.33.mlp.down_proj.weight shape: [5120, 17920]
+166: model.layers.33.mlp.gate_up_proj.weight shape: [35840, 5120]
+167: model.layers.33.post_attention_layernorm.weight shape: [5120]
+168: model.layers.33.self_attn.o_proj.weight shape: [5120, 5120]
+169: model.layers.33.self_attn.qkv_proj.weight shape: [7680, 5120]
+170: model.layers.34.input_layernorm.weight shape: [5120]
+171: model.layers.34.mlp.down_proj.weight shape: [5120, 17920]
+172: model.layers.34.mlp.gate_up_proj.weight shape: [35840, 5120]
+173: model.layers.34.post_attention_layernorm.weight shape: [5120]
+174: model.layers.34.self_attn.o_proj.weight shape: [5120, 5120]
+175: model.layers.34.self_attn.qkv_proj.weight shape: [7680, 5120]
+176: model.layers.35.input_layernorm.weight shape: [5120]
+177: model.layers.35.mlp.down_proj.weight shape: [5120, 17920]
+178: model.layers.35.mlp.gate_up_proj.weight shape: [35840, 5120]
+179: model.layers.35.post_attention_layernorm.weight shape: [5120]
+180: model.layers.35.self_attn.o_proj.weight shape: [5120, 5120]
+181: model.layers.35.self_attn.qkv_proj.weight shape: [7680, 5120]
+182: model.layers.36.input_layernorm.weight shape: [5120]
+183: model.layers.36.mlp.down_proj.weight shape: [5120, 17920]
+184: model.layers.36.mlp.gate_up_proj.weight shape: [35840, 5120]
+185: model.layers.36.post_attention_layernorm.weight shape: [5120]
+186: model.layers.36.self_attn.o_proj.weight shape: [5120, 5120]
+187: model.layers.36.self_attn.qkv_proj.weight shape: [7680, 5120]
+188: model.layers.37.input_layernorm.weight shape: [5120]
+189: model.layers.37.mlp.down_proj.weight shape: [5120, 17920]
+190: model.layers.37.mlp.gate_up_proj.weight shape: [35840, 5120]
+191: model.layers.37.post_attention_layernorm.weight shape: [5120]
+192: model.layers.37.self_attn.o_proj.weight shape: [5120, 5120]
+193: model.layers.37.self_attn.qkv_proj.weight shape: [7680, 5120]
+194: model.layers.38.input_layernorm.weight shape: [5120]
+195: model.layers.38.mlp.down_proj.weight shape: [5120, 17920]
+196: model.layers.38.mlp.gate_up_proj.weight shape: [35840, 5120]
+197: model.layers.38.post_attention_layernorm.weight shape: [5120]
+198: model.layers.38.self_attn.o_proj.weight shape: [5120, 5120]
+199: model.layers.38.self_attn.qkv_proj.weight shape: [7680, 5120]
+200: model.layers.39.input_layernorm.weight shape: [5120]
+201: model.layers.39.mlp.down_proj.weight shape: [5120, 17920]
+202: model.layers.39.mlp.gate_up_proj.weight shape: [35840, 5120]
+203: model.layers.39.post_attention_layernorm.weight shape: [5120]
+204: model.layers.39.self_attn.o_proj.weight shape: [5120, 5120]
+205: model.layers.39.self_attn.qkv_proj.weight shape: [7680, 5120]
+206: model.layers.4.input_layernorm.weight shape: [5120]
+207: model.layers.4.mlp.down_proj.weight shape: [5120, 17920]
+208: model.layers.4.mlp.gate_up_proj.weight shape: [35840, 5120]
+209: model.layers.4.post_attention_layernorm.weight shape: [5120]
+210: model.layers.4.self_attn.o_proj.weight shape: [5120, 5120]
+211: model.layers.4.self_attn.qkv_proj.weight shape: [7680, 5120]
+212: model.layers.5.input_layernorm.weight shape: [5120]
+213: model.layers.5.mlp.down_proj.weight shape: [5120, 17920]
+214: model.layers.5.mlp.gate_up_proj.weight shape: [35840, 5120]
+215: model.layers.5.post_attention_layernorm.weight shape: [5120]
+216: model.layers.5.self_attn.o_proj.weight shape: [5120, 5120]
+217: model.layers.5.self_attn.qkv_proj.weight shape: [7680, 5120]
+218: model.layers.6.input_layernorm.weight shape: [5120]
+219: model.layers.6.mlp.down_proj.weight shape: [5120, 17920]
+220: model.layers.6.mlp.gate_up_proj.weight shape: [35840, 5120]
+221: model.layers.6.post_attention_layernorm.weight shape: [5120]
+222: model.layers.6.self_attn.o_proj.weight shape: [5120, 5120]
+223: model.layers.6.self_attn.qkv_proj.weight shape: [7680, 5120]
+224: model.layers.7.input_layernorm.weight shape: [5120]
+225: model.layers.7.mlp.down_proj.weight shape: [5120, 17920]
+226: model.layers.7.mlp.gate_up_proj.weight shape: [35840, 5120]
+227: model.layers.7.post_attention_layernorm.weight shape: [5120]
+228: model.layers.7.self_attn.o_proj.weight shape: [5120, 5120]
+229: model.layers.7.self_attn.qkv_proj.weight shape: [7680, 5120]
+230: model.layers.8.input_layernorm.weight shape: [5120]
+231: model.layers.8.mlp.down_proj.weight shape: [5120, 17920]
+232: model.layers.8.mlp.gate_up_proj.weight shape: [35840, 5120]
+233: model.layers.8.post_attention_layernorm.weight shape: [5120]
+234: model.layers.8.self_attn.o_proj.weight shape: [5120, 5120]
+235: model.layers.8.self_attn.qkv_proj.weight shape: [7680, 5120]
+236: model.layers.9.input_layernorm.weight shape: [5120]
+237: model.layers.9.mlp.down_proj.weight shape: [5120, 17920]
+238: model.layers.9.mlp.gate_up_proj.weight shape: [35840, 5120]
+239: model.layers.9.post_attention_layernorm.weight shape: [5120]
+240: model.layers.9.self_attn.o_proj.weight shape: [5120, 5120]
+241: model.layers.9.self_attn.qkv_proj.weight shape: [7680, 5120]
+242: model.norm.weight shape: [5120]
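
The Phi-3-medium dump looks quite different from Phi-2: weight-only RMS-style norms, no biases anywhere, a separate post_attention_layernorm per layer, and fused qkv_proj / gate_up_proj matrices (the 4K-context variant below reuses the identical blob, index 277f686aa7). The fused row counts are consistent with the published Phi-3-medium configuration; a quick check, again treating the config values as assumptions:

```csharp
// Fused-projection arithmetic for the Phi-3-medium shapes above; hidden
// size, head counts, and intermediate size are assumptions taken from the
// published Phi-3-medium config.
const int hidden = 5120;
const int numHeads = 40, numKvHeads = 10;   // grouped-query attention, 4:1
const int intermediate = 17920;             // down_proj is [5120, 17920]

int headDim = hidden / numHeads;                  // 128
int qkvRows = hidden + 2 * numKvHeads * headDim;  // 5120 + 2*1280 = 7680
int gateUpRows = 2 * intermediate;                // gate and up stacked = 35840

System.Console.WriteLine((qkvRows, gateUpRows));  // (7680, 35840) -- matches the dump
```
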
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium4KShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium4KShapeTest.approved.txt
new file mode 100644
index 0000000000..277f686aa7
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Medium4KShapeTest.approved.txt
@@ -0,0 +1,243 @@
+0: lm_head.weight shape: [32064, 5120]
+1: model.embed_tokens.weight shape: [32064, 5120]
+2: model.layers.0.input_layernorm.weight shape: [5120]
+3: model.layers.0.mlp.down_proj.weight shape: [5120, 17920]
+4: model.layers.0.mlp.gate_up_proj.weight shape: [35840, 5120]
+5: model.layers.0.post_attention_layernorm.weight shape: [5120]
+6: model.layers.0.self_attn.o_proj.weight shape: [5120, 5120]
+7: model.layers.0.self_attn.qkv_proj.weight shape: [7680, 5120]
+8: model.layers.1.input_layernorm.weight shape: [5120]
+9: model.layers.1.mlp.down_proj.weight shape: [5120, 17920]
+10: model.layers.1.mlp.gate_up_proj.weight shape: [35840, 5120]
+11: model.layers.1.post_attention_layernorm.weight shape: [5120]
+12: model.layers.1.self_attn.o_proj.weight shape: [5120, 5120]
+13: model.layers.1.self_attn.qkv_proj.weight shape: [7680, 5120]
+14: model.layers.10.input_layernorm.weight shape: [5120]
+15: model.layers.10.mlp.down_proj.weight shape: [5120, 17920]
+16: model.layers.10.mlp.gate_up_proj.weight shape: [35840, 5120]
+17: model.layers.10.post_attention_layernorm.weight shape: [5120]
+18: model.layers.10.self_attn.o_proj.weight shape: [5120, 5120]
+19: model.layers.10.self_attn.qkv_proj.weight shape: [7680, 5120]
+20: model.layers.11.input_layernorm.weight shape: [5120]
+21: model.layers.11.mlp.down_proj.weight shape: [5120, 17920]
+22: model.layers.11.mlp.gate_up_proj.weight shape: [35840, 5120]
+23: model.layers.11.post_attention_layernorm.weight shape: [5120]
+24: model.layers.11.self_attn.o_proj.weight shape: [5120, 5120]
+25: model.layers.11.self_attn.qkv_proj.weight shape: [7680, 5120]
+26: model.layers.12.input_layernorm.weight shape: [5120]
+27: model.layers.12.mlp.down_proj.weight shape: [5120, 17920]
+28: model.layers.12.mlp.gate_up_proj.weight shape: [35840, 5120]
+29: model.layers.12.post_attention_layernorm.weight shape: [5120]
+30: model.layers.12.self_attn.o_proj.weight shape: [5120, 5120]
+31: model.layers.12.self_attn.qkv_proj.weight shape: [7680, 5120]
+32: model.layers.13.input_layernorm.weight shape: [5120]
+33: model.layers.13.mlp.down_proj.weight shape: [5120, 17920]
+34: model.layers.13.mlp.gate_up_proj.weight shape: [35840, 5120]
+35: model.layers.13.post_attention_layernorm.weight shape: [5120]
+36: model.layers.13.self_attn.o_proj.weight shape: [5120, 5120]
+37: model.layers.13.self_attn.qkv_proj.weight shape: [7680, 5120]
+38: model.layers.14.input_layernorm.weight shape: [5120]
+39: model.layers.14.mlp.down_proj.weight shape: [5120, 17920]
+40: model.layers.14.mlp.gate_up_proj.weight shape: [35840, 5120]
+41: model.layers.14.post_attention_layernorm.weight shape: [5120]
+42: model.layers.14.self_attn.o_proj.weight shape: [5120, 5120]
+43: model.layers.14.self_attn.qkv_proj.weight shape: [7680, 5120]
+44: model.layers.15.input_layernorm.weight shape: [5120]
+45: model.layers.15.mlp.down_proj.weight shape: [5120, 17920]
+46: model.layers.15.mlp.gate_up_proj.weight shape: [35840, 5120]
+47: model.layers.15.post_attention_layernorm.weight shape: [5120]
+48: model.layers.15.self_attn.o_proj.weight shape: [5120, 5120]
+49: model.layers.15.self_attn.qkv_proj.weight shape: [7680, 5120]
+50: model.layers.16.input_layernorm.weight shape: [5120]
+51: model.layers.16.mlp.down_proj.weight shape: [5120, 17920]
+52: model.layers.16.mlp.gate_up_proj.weight shape: [35840, 5120]
+53: model.layers.16.post_attention_layernorm.weight shape: [5120]
+54: model.layers.16.self_attn.o_proj.weight shape: [5120, 5120]
+55: model.layers.16.self_attn.qkv_proj.weight shape: [7680, 5120]
+56: model.layers.17.input_layernorm.weight shape: [5120]
+57: model.layers.17.mlp.down_proj.weight shape: [5120, 17920]
+58: model.layers.17.mlp.gate_up_proj.weight shape: [35840, 5120]
+59: model.layers.17.post_attention_layernorm.weight shape: [5120]
+60: model.layers.17.self_attn.o_proj.weight shape: [5120, 5120]
+61: model.layers.17.self_attn.qkv_proj.weight shape: [7680, 5120]
+62: model.layers.18.input_layernorm.weight shape: [5120]
+63: model.layers.18.mlp.down_proj.weight shape: [5120, 17920]
+64: model.layers.18.mlp.gate_up_proj.weight shape: [35840, 5120]
+65: model.layers.18.post_attention_layernorm.weight shape: [5120]
+66: model.layers.18.self_attn.o_proj.weight shape: [5120, 5120]
+67: model.layers.18.self_attn.qkv_proj.weight shape: [7680, 5120]
+68: model.layers.19.input_layernorm.weight shape: [5120]
+69: model.layers.19.mlp.down_proj.weight shape: [5120, 17920]
+70: model.layers.19.mlp.gate_up_proj.weight shape: [35840, 5120]
+71: model.layers.19.post_attention_layernorm.weight shape: [5120]
+72: model.layers.19.self_attn.o_proj.weight shape: [5120, 5120]
+73: model.layers.19.self_attn.qkv_proj.weight shape: [7680, 5120]
+74: model.layers.2.input_layernorm.weight shape: [5120]
+75: model.layers.2.mlp.down_proj.weight shape: [5120, 17920]
+76: model.layers.2.mlp.gate_up_proj.weight shape: [35840, 5120]
+77: model.layers.2.post_attention_layernorm.weight shape: [5120]
+78: model.layers.2.self_attn.o_proj.weight shape: [5120, 5120]
+79: model.layers.2.self_attn.qkv_proj.weight shape: [7680, 5120]
+80: model.layers.20.input_layernorm.weight shape: [5120]
+81: model.layers.20.mlp.down_proj.weight shape: [5120, 17920]
+82: model.layers.20.mlp.gate_up_proj.weight shape: [35840, 5120]
+83: model.layers.20.post_attention_layernorm.weight shape: [5120]
+84: model.layers.20.self_attn.o_proj.weight shape: [5120, 5120]
+85: model.layers.20.self_attn.qkv_proj.weight shape: [7680, 5120]
+86: model.layers.21.input_layernorm.weight shape: [5120]
+87: model.layers.21.mlp.down_proj.weight shape: [5120, 17920]
+88: model.layers.21.mlp.gate_up_proj.weight shape: [35840, 5120]
+89: model.layers.21.post_attention_layernorm.weight shape: [5120]
+90: model.layers.21.self_attn.o_proj.weight shape: [5120, 5120]
+91: model.layers.21.self_attn.qkv_proj.weight shape: [7680, 5120]
+92: model.layers.22.input_layernorm.weight shape: [5120]
+93: model.layers.22.mlp.down_proj.weight shape: [5120, 17920]
+94: model.layers.22.mlp.gate_up_proj.weight shape: [35840, 5120]
+95: model.layers.22.post_attention_layernorm.weight shape: [5120]
+96: model.layers.22.self_attn.o_proj.weight shape: [5120, 5120]
+97: model.layers.22.self_attn.qkv_proj.weight shape: [7680, 5120]
+98: model.layers.23.input_layernorm.weight shape: [5120]
+99: model.layers.23.mlp.down_proj.weight shape: [5120, 17920]
+100: model.layers.23.mlp.gate_up_proj.weight shape: [35840, 5120]
+101: model.layers.23.post_attention_layernorm.weight shape: [5120]
+102: model.layers.23.self_attn.o_proj.weight shape: [5120, 5120]
+103: model.layers.23.self_attn.qkv_proj.weight shape: [7680, 5120]
+104: model.layers.24.input_layernorm.weight shape: [5120]
+105: model.layers.24.mlp.down_proj.weight shape: [5120, 17920]
+106: model.layers.24.mlp.gate_up_proj.weight shape: [35840, 5120]
+107: model.layers.24.post_attention_layernorm.weight shape: [5120]
+108: model.layers.24.self_attn.o_proj.weight shape: [5120, 5120]
+109: model.layers.24.self_attn.qkv_proj.weight shape: [7680, 5120]
+110: model.layers.25.input_layernorm.weight shape: [5120]
+111: model.layers.25.mlp.down_proj.weight shape: [5120, 17920]
+112: model.layers.25.mlp.gate_up_proj.weight shape: [35840, 5120]
+113: model.layers.25.post_attention_layernorm.weight shape: [5120]
+114: model.layers.25.self_attn.o_proj.weight shape: [5120, 5120]
+115: model.layers.25.self_attn.qkv_proj.weight shape: [7680, 5120]
+116: model.layers.26.input_layernorm.weight shape: [5120]
+117: model.layers.26.mlp.down_proj.weight shape: [5120, 17920]
+118: model.layers.26.mlp.gate_up_proj.weight shape: [35840, 5120]
+119: model.layers.26.post_attention_layernorm.weight shape: [5120]
+120: model.layers.26.self_attn.o_proj.weight shape: [5120, 5120]
+121: model.layers.26.self_attn.qkv_proj.weight shape: [7680, 5120]
+122: model.layers.27.input_layernorm.weight shape: [5120]
+123: model.layers.27.mlp.down_proj.weight shape: [5120, 17920]
+124: model.layers.27.mlp.gate_up_proj.weight shape: [35840, 5120]
+125: model.layers.27.post_attention_layernorm.weight shape: [5120]
+126: model.layers.27.self_attn.o_proj.weight shape: [5120, 5120]
+127: model.layers.27.self_attn.qkv_proj.weight shape: [7680, 5120]
+128: model.layers.28.input_layernorm.weight shape: [5120]
+129: model.layers.28.mlp.down_proj.weight shape: [5120, 17920]
+130: model.layers.28.mlp.gate_up_proj.weight shape: [35840, 5120]
+131: model.layers.28.post_attention_layernorm.weight shape: [5120]
+132: model.layers.28.self_attn.o_proj.weight shape: [5120, 5120]
+133: model.layers.28.self_attn.qkv_proj.weight shape: [7680, 5120]
+134: model.layers.29.input_layernorm.weight shape: [5120]
+135: model.layers.29.mlp.down_proj.weight shape: [5120, 17920]
+136: model.layers.29.mlp.gate_up_proj.weight shape: [35840, 5120]
+137: model.layers.29.post_attention_layernorm.weight shape: [5120]
+138: model.layers.29.self_attn.o_proj.weight shape: [5120, 5120]
+139: model.layers.29.self_attn.qkv_proj.weight shape: [7680, 5120]
+140: model.layers.3.input_layernorm.weight shape: [5120]
+141: model.layers.3.mlp.down_proj.weight shape: [5120, 17920]
+142: model.layers.3.mlp.gate_up_proj.weight shape: [35840, 5120]
+143: model.layers.3.post_attention_layernorm.weight shape: [5120]
+144: model.layers.3.self_attn.o_proj.weight shape: [5120, 5120]
+145: model.layers.3.self_attn.qkv_proj.weight shape: [7680, 5120]
+146: model.layers.30.input_layernorm.weight shape: [5120]
+147: model.layers.30.mlp.down_proj.weight shape: [5120, 17920]
+148: model.layers.30.mlp.gate_up_proj.weight shape: [35840, 5120]
+149: model.layers.30.post_attention_layernorm.weight shape: [5120]
+150: model.layers.30.self_attn.o_proj.weight shape: [5120, 5120]
+151: model.layers.30.self_attn.qkv_proj.weight shape: [7680, 5120]
+152: model.layers.31.input_layernorm.weight shape: [5120]
+153: model.layers.31.mlp.down_proj.weight shape: [5120, 17920]
+154: model.layers.31.mlp.gate_up_proj.weight shape: [35840, 5120]
+155: model.layers.31.post_attention_layernorm.weight shape: [5120]
+156: model.layers.31.self_attn.o_proj.weight shape: [5120, 5120]
+157: model.layers.31.self_attn.qkv_proj.weight shape: [7680, 5120]
+158: model.layers.32.input_layernorm.weight shape: [5120]
+159: model.layers.32.mlp.down_proj.weight shape: [5120, 17920]
+160: model.layers.32.mlp.gate_up_proj.weight shape: [35840, 5120]
+161: model.layers.32.post_attention_layernorm.weight shape: [5120]
+162: model.layers.32.self_attn.o_proj.weight shape: [5120, 5120]
+163: model.layers.32.self_attn.qkv_proj.weight shape: [7680, 5120]
+164: model.layers.33.input_layernorm.weight shape: [5120]
+165: model.layers.33.mlp.down_proj.weight shape: [5120, 17920]
+166: model.layers.33.mlp.gate_up_proj.weight shape: [35840, 5120]
+167: model.layers.33.post_attention_layernorm.weight shape: [5120]
+168: model.layers.33.self_attn.o_proj.weight shape: [5120, 5120]
+169: model.layers.33.self_attn.qkv_proj.weight shape: [7680, 5120]
+170: model.layers.34.input_layernorm.weight shape: [5120]
+171: model.layers.34.mlp.down_proj.weight shape: [5120, 17920]
+172: model.layers.34.mlp.gate_up_proj.weight shape: [35840, 5120]
+173: model.layers.34.post_attention_layernorm.weight shape: [5120]
+174: model.layers.34.self_attn.o_proj.weight shape: [5120, 5120]
+175: model.layers.34.self_attn.qkv_proj.weight shape: [7680, 5120]
+176: model.layers.35.input_layernorm.weight shape: [5120]
+177: model.layers.35.mlp.down_proj.weight shape: [5120, 17920]
+178: model.layers.35.mlp.gate_up_proj.weight shape: [35840, 5120]
+179: model.layers.35.post_attention_layernorm.weight shape: [5120]
+180: model.layers.35.self_attn.o_proj.weight shape: [5120, 5120]
+181: model.layers.35.self_attn.qkv_proj.weight shape: [7680, 5120]
+182: model.layers.36.input_layernorm.weight shape: [5120]
+183: model.layers.36.mlp.down_proj.weight shape: [5120, 17920]
+184: model.layers.36.mlp.gate_up_proj.weight shape: [35840, 5120]
+185: model.layers.36.post_attention_layernorm.weight shape: [5120]
+186: model.layers.36.self_attn.o_proj.weight shape: [5120, 5120]
+187: model.layers.36.self_attn.qkv_proj.weight shape: [7680, 5120]
+188: model.layers.37.input_layernorm.weight shape: [5120]
+189: model.layers.37.mlp.down_proj.weight shape: [5120, 17920]
+190: model.layers.37.mlp.gate_up_proj.weight shape: [35840, 5120]
+191: model.layers.37.post_attention_layernorm.weight shape: [5120]
+192: model.layers.37.self_attn.o_proj.weight shape: [5120, 5120]
+193: model.layers.37.self_attn.qkv_proj.weight shape: [7680, 5120]
+194: model.layers.38.input_layernorm.weight shape: [5120]
+195: model.layers.38.mlp.down_proj.weight shape: [5120, 17920]
+196: model.layers.38.mlp.gate_up_proj.weight shape: [35840, 5120]
+197: model.layers.38.post_attention_layernorm.weight shape: [5120]
+198: model.layers.38.self_attn.o_proj.weight shape: [5120, 5120]
+199: model.layers.38.self_attn.qkv_proj.weight shape: [7680, 5120]
+200: model.layers.39.input_layernorm.weight shape: [5120]
+201: model.layers.39.mlp.down_proj.weight shape: [5120, 17920]
+202: model.layers.39.mlp.gate_up_proj.weight shape: [35840, 5120]
+203: model.layers.39.post_attention_layernorm.weight shape: [5120]
+204: model.layers.39.self_attn.o_proj.weight shape: [5120, 5120]
+205: model.layers.39.self_attn.qkv_proj.weight shape: [7680, 5120]
+206: model.layers.4.input_layernorm.weight shape: [5120]
+207: model.layers.4.mlp.down_proj.weight shape: [5120, 17920]
+208: model.layers.4.mlp.gate_up_proj.weight shape: [35840, 5120]
+209: model.layers.4.post_attention_layernorm.weight shape: [5120]
+210: model.layers.4.self_attn.o_proj.weight shape: [5120, 5120]
+211: model.layers.4.self_attn.qkv_proj.weight shape: [7680, 5120]
+212: model.layers.5.input_layernorm.weight shape: [5120]
+213: model.layers.5.mlp.down_proj.weight shape: [5120, 17920]
+214: model.layers.5.mlp.gate_up_proj.weight shape: [35840, 5120]
+215: model.layers.5.post_attention_layernorm.weight shape: [5120]
+216: model.layers.5.self_attn.o_proj.weight shape: [5120, 5120]
+217: model.layers.5.self_attn.qkv_proj.weight shape: [7680, 5120]
+218: model.layers.6.input_layernorm.weight shape: [5120]
+219: model.layers.6.mlp.down_proj.weight shape: [5120, 17920]
+220: model.layers.6.mlp.gate_up_proj.weight shape: [35840, 5120]
+221: model.layers.6.post_attention_layernorm.weight shape: [5120]
+222: model.layers.6.self_attn.o_proj.weight shape: [5120, 5120]
+223: model.layers.6.self_attn.qkv_proj.weight shape: [7680, 5120]
+224: model.layers.7.input_layernorm.weight shape: [5120]
+225: model.layers.7.mlp.down_proj.weight shape: [5120, 17920]
+226: model.layers.7.mlp.gate_up_proj.weight shape: [35840, 5120]
+227: model.layers.7.post_attention_layernorm.weight shape: [5120]
+228: model.layers.7.self_attn.o_proj.weight shape: [5120, 5120]
+229: model.layers.7.self_attn.qkv_proj.weight shape: [7680, 5120]
+230: model.layers.8.input_layernorm.weight shape: [5120]
+231: model.layers.8.mlp.down_proj.weight shape: [5120, 17920]
+232: model.layers.8.mlp.gate_up_proj.weight shape: [35840, 5120]
+233: model.layers.8.post_attention_layernorm.weight shape: [5120]
+234: model.layers.8.self_attn.o_proj.weight shape: [5120, 5120]
+235: model.layers.8.self_attn.qkv_proj.weight shape: [7680, 5120]
+236: model.layers.9.input_layernorm.weight shape: [5120]
+237: model.layers.9.mlp.down_proj.weight shape: [5120, 17920]
+238: model.layers.9.mlp.gate_up_proj.weight shape: [35840, 5120]
+239: model.layers.9.post_attention_layernorm.weight shape: [5120]
+240: model.layers.9.self_attn.o_proj.weight shape: [5120, 5120]
+241: model.layers.9.self_attn.qkv_proj.weight shape: [7680, 5120]
+242: model.norm.weight shape: [5120]
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KLayerSizeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KLayerSizeTest.approved.txt
new file mode 100644
index 0000000000..4f711239a7
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KLayerSizeTest.approved.txt
@@ -0,0 +1,34 @@
+{
+ "model.layers.0": 216,
+ "model.layers.1": 216,
+ "model.layers.2": 216,
+ "model.layers.3": 216,
+ "model.layers.4": 216,
+ "model.layers.5": 216,
+ "model.layers.6": 216,
+ "model.layers.7": 216,
+ "model.layers.8": 216,
+ "model.layers.9": 216,
+ "model.layers.10": 216,
+ "model.layers.11": 216,
+ "model.layers.12": 216,
+ "model.layers.13": 216,
+ "model.layers.14": 216,
+ "model.layers.15": 216,
+ "model.layers.16": 216,
+ "model.layers.17": 216,
+ "model.layers.18": 216,
+ "model.layers.19": 216,
+ "model.layers.20": 216,
+ "model.layers.21": 216,
+ "model.layers.22": 216,
+ "model.layers.23": 216,
+ "model.layers.24": 216,
+ "model.layers.25": 216,
+ "model.layers.26": 216,
+ "model.layers.27": 216,
+ "model.layers.28": 216,
+ "model.layers.29": 216,
+ "model.layers.30": 216,
+ "model.layers.31": 216
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KShapeTest.approved.txt
new file mode 100644
index 0000000000..2278f3b67d
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini128KShapeTest.approved.txt
@@ -0,0 +1,195 @@
+0: lm_head.weight shape: [32064, 3072]
+1: model.embed_tokens.weight shape: [32064, 3072]
+2: model.layers.0.input_layernorm.weight shape: [3072]
+3: model.layers.0.mlp.down_proj.weight shape: [3072, 8192]
+4: model.layers.0.mlp.gate_up_proj.weight shape: [16384, 3072]
+5: model.layers.0.post_attention_layernorm.weight shape: [3072]
+6: model.layers.0.self_attn.o_proj.weight shape: [3072, 3072]
+7: model.layers.0.self_attn.qkv_proj.weight shape: [9216, 3072]
+8: model.layers.1.input_layernorm.weight shape: [3072]
+9: model.layers.1.mlp.down_proj.weight shape: [3072, 8192]
+10: model.layers.1.mlp.gate_up_proj.weight shape: [16384, 3072]
+11: model.layers.1.post_attention_layernorm.weight shape: [3072]
+12: model.layers.1.self_attn.o_proj.weight shape: [3072, 3072]
+13: model.layers.1.self_attn.qkv_proj.weight shape: [9216, 3072]
+14: model.layers.10.input_layernorm.weight shape: [3072]
+15: model.layers.10.mlp.down_proj.weight shape: [3072, 8192]
+16: model.layers.10.mlp.gate_up_proj.weight shape: [16384, 3072]
+17: model.layers.10.post_attention_layernorm.weight shape: [3072]
+18: model.layers.10.self_attn.o_proj.weight shape: [3072, 3072]
+19: model.layers.10.self_attn.qkv_proj.weight shape: [9216, 3072]
+20: model.layers.11.input_layernorm.weight shape: [3072]
+21: model.layers.11.mlp.down_proj.weight shape: [3072, 8192]
+22: model.layers.11.mlp.gate_up_proj.weight shape: [16384, 3072]
+23: model.layers.11.post_attention_layernorm.weight shape: [3072]
+24: model.layers.11.self_attn.o_proj.weight shape: [3072, 3072]
+25: model.layers.11.self_attn.qkv_proj.weight shape: [9216, 3072]
+26: model.layers.12.input_layernorm.weight shape: [3072]
+27: model.layers.12.mlp.down_proj.weight shape: [3072, 8192]
+28: model.layers.12.mlp.gate_up_proj.weight shape: [16384, 3072]
+29: model.layers.12.post_attention_layernorm.weight shape: [3072]
+30: model.layers.12.self_attn.o_proj.weight shape: [3072, 3072]
+31: model.layers.12.self_attn.qkv_proj.weight shape: [9216, 3072]
+32: model.layers.13.input_layernorm.weight shape: [3072]
+33: model.layers.13.mlp.down_proj.weight shape: [3072, 8192]
+34: model.layers.13.mlp.gate_up_proj.weight shape: [16384, 3072]
+35: model.layers.13.post_attention_layernorm.weight shape: [3072]
+36: model.layers.13.self_attn.o_proj.weight shape: [3072, 3072]
+37: model.layers.13.self_attn.qkv_proj.weight shape: [9216, 3072]
+38: model.layers.14.input_layernorm.weight shape: [3072]
+39: model.layers.14.mlp.down_proj.weight shape: [3072, 8192]
+40: model.layers.14.mlp.gate_up_proj.weight shape: [16384, 3072]
+41: model.layers.14.post_attention_layernorm.weight shape: [3072]
+42: model.layers.14.self_attn.o_proj.weight shape: [3072, 3072]
+43: model.layers.14.self_attn.qkv_proj.weight shape: [9216, 3072]
+44: model.layers.15.input_layernorm.weight shape: [3072]
+45: model.layers.15.mlp.down_proj.weight shape: [3072, 8192]
+46: model.layers.15.mlp.gate_up_proj.weight shape: [16384, 3072]
+47: model.layers.15.post_attention_layernorm.weight shape: [3072]
+48: model.layers.15.self_attn.o_proj.weight shape: [3072, 3072]
+49: model.layers.15.self_attn.qkv_proj.weight shape: [9216, 3072]
+50: model.layers.16.input_layernorm.weight shape: [3072]
+51: model.layers.16.mlp.down_proj.weight shape: [3072, 8192]
+52: model.layers.16.mlp.gate_up_proj.weight shape: [16384, 3072]
+53: model.layers.16.post_attention_layernorm.weight shape: [3072]
+54: model.layers.16.self_attn.o_proj.weight shape: [3072, 3072]
+55: model.layers.16.self_attn.qkv_proj.weight shape: [9216, 3072]
+56: model.layers.17.input_layernorm.weight shape: [3072]
+57: model.layers.17.mlp.down_proj.weight shape: [3072, 8192]
+58: model.layers.17.mlp.gate_up_proj.weight shape: [16384, 3072]
+59: model.layers.17.post_attention_layernorm.weight shape: [3072]
+60: model.layers.17.self_attn.o_proj.weight shape: [3072, 3072]
+61: model.layers.17.self_attn.qkv_proj.weight shape: [9216, 3072]
+62: model.layers.18.input_layernorm.weight shape: [3072]
+63: model.layers.18.mlp.down_proj.weight shape: [3072, 8192]
+64: model.layers.18.mlp.gate_up_proj.weight shape: [16384, 3072]
+65: model.layers.18.post_attention_layernorm.weight shape: [3072]
+66: model.layers.18.self_attn.o_proj.weight shape: [3072, 3072]
+67: model.layers.18.self_attn.qkv_proj.weight shape: [9216, 3072]
+68: model.layers.19.input_layernorm.weight shape: [3072]
+69: model.layers.19.mlp.down_proj.weight shape: [3072, 8192]
+70: model.layers.19.mlp.gate_up_proj.weight shape: [16384, 3072]
+71: model.layers.19.post_attention_layernorm.weight shape: [3072]
+72: model.layers.19.self_attn.o_proj.weight shape: [3072, 3072]
+73: model.layers.19.self_attn.qkv_proj.weight shape: [9216, 3072]
+74: model.layers.2.input_layernorm.weight shape: [3072]
+75: model.layers.2.mlp.down_proj.weight shape: [3072, 8192]
+76: model.layers.2.mlp.gate_up_proj.weight shape: [16384, 3072]
+77: model.layers.2.post_attention_layernorm.weight shape: [3072]
+78: model.layers.2.self_attn.o_proj.weight shape: [3072, 3072]
+79: model.layers.2.self_attn.qkv_proj.weight shape: [9216, 3072]
+80: model.layers.20.input_layernorm.weight shape: [3072]
+81: model.layers.20.mlp.down_proj.weight shape: [3072, 8192]
+82: model.layers.20.mlp.gate_up_proj.weight shape: [16384, 3072]
+83: model.layers.20.post_attention_layernorm.weight shape: [3072]
+84: model.layers.20.self_attn.o_proj.weight shape: [3072, 3072]
+85: model.layers.20.self_attn.qkv_proj.weight shape: [9216, 3072]
+86: model.layers.21.input_layernorm.weight shape: [3072]
+87: model.layers.21.mlp.down_proj.weight shape: [3072, 8192]
+88: model.layers.21.mlp.gate_up_proj.weight shape: [16384, 3072]
+89: model.layers.21.post_attention_layernorm.weight shape: [3072]
+90: model.layers.21.self_attn.o_proj.weight shape: [3072, 3072]
+91: model.layers.21.self_attn.qkv_proj.weight shape: [9216, 3072]
+92: model.layers.22.input_layernorm.weight shape: [3072]
+93: model.layers.22.mlp.down_proj.weight shape: [3072, 8192]
+94: model.layers.22.mlp.gate_up_proj.weight shape: [16384, 3072]
+95: model.layers.22.post_attention_layernorm.weight shape: [3072]
+96: model.layers.22.self_attn.o_proj.weight shape: [3072, 3072]
+97: model.layers.22.self_attn.qkv_proj.weight shape: [9216, 3072]
+98: model.layers.23.input_layernorm.weight shape: [3072]
+99: model.layers.23.mlp.down_proj.weight shape: [3072, 8192]
+100: model.layers.23.mlp.gate_up_proj.weight shape: [16384, 3072]
+101: model.layers.23.post_attention_layernorm.weight shape: [3072]
+102: model.layers.23.self_attn.o_proj.weight shape: [3072, 3072]
+103: model.layers.23.self_attn.qkv_proj.weight shape: [9216, 3072]
+104: model.layers.24.input_layernorm.weight shape: [3072]
+105: model.layers.24.mlp.down_proj.weight shape: [3072, 8192]
+106: model.layers.24.mlp.gate_up_proj.weight shape: [16384, 3072]
+107: model.layers.24.post_attention_layernorm.weight shape: [3072]
+108: model.layers.24.self_attn.o_proj.weight shape: [3072, 3072]
+109: model.layers.24.self_attn.qkv_proj.weight shape: [9216, 3072]
+110: model.layers.25.input_layernorm.weight shape: [3072]
+111: model.layers.25.mlp.down_proj.weight shape: [3072, 8192]
+112: model.layers.25.mlp.gate_up_proj.weight shape: [16384, 3072]
+113: model.layers.25.post_attention_layernorm.weight shape: [3072]
+114: model.layers.25.self_attn.o_proj.weight shape: [3072, 3072]
+115: model.layers.25.self_attn.qkv_proj.weight shape: [9216, 3072]
+116: model.layers.26.input_layernorm.weight shape: [3072]
+117: model.layers.26.mlp.down_proj.weight shape: [3072, 8192]
+118: model.layers.26.mlp.gate_up_proj.weight shape: [16384, 3072]
+119: model.layers.26.post_attention_layernorm.weight shape: [3072]
+120: model.layers.26.self_attn.o_proj.weight shape: [3072, 3072]
+121: model.layers.26.self_attn.qkv_proj.weight shape: [9216, 3072]
+122: model.layers.27.input_layernorm.weight shape: [3072]
+123: model.layers.27.mlp.down_proj.weight shape: [3072, 8192]
+124: model.layers.27.mlp.gate_up_proj.weight shape: [16384, 3072]
+125: model.layers.27.post_attention_layernorm.weight shape: [3072]
+126: model.layers.27.self_attn.o_proj.weight shape: [3072, 3072]
+127: model.layers.27.self_attn.qkv_proj.weight shape: [9216, 3072]
+128: model.layers.28.input_layernorm.weight shape: [3072]
+129: model.layers.28.mlp.down_proj.weight shape: [3072, 8192]
+130: model.layers.28.mlp.gate_up_proj.weight shape: [16384, 3072]
+131: model.layers.28.post_attention_layernorm.weight shape: [3072]
+132: model.layers.28.self_attn.o_proj.weight shape: [3072, 3072]
+133: model.layers.28.self_attn.qkv_proj.weight shape: [9216, 3072]
+134: model.layers.29.input_layernorm.weight shape: [3072]
+135: model.layers.29.mlp.down_proj.weight shape: [3072, 8192]
+136: model.layers.29.mlp.gate_up_proj.weight shape: [16384, 3072]
+137: model.layers.29.post_attention_layernorm.weight shape: [3072]
+138: model.layers.29.self_attn.o_proj.weight shape: [3072, 3072]
+139: model.layers.29.self_attn.qkv_proj.weight shape: [9216, 3072]
+140: model.layers.3.input_layernorm.weight shape: [3072]
+141: model.layers.3.mlp.down_proj.weight shape: [3072, 8192]
+142: model.layers.3.mlp.gate_up_proj.weight shape: [16384, 3072]
+143: model.layers.3.post_attention_layernorm.weight shape: [3072]
+144: model.layers.3.self_attn.o_proj.weight shape: [3072, 3072]
+145: model.layers.3.self_attn.qkv_proj.weight shape: [9216, 3072]
+146: model.layers.30.input_layernorm.weight shape: [3072]
+147: model.layers.30.mlp.down_proj.weight shape: [3072, 8192]
+148: model.layers.30.mlp.gate_up_proj.weight shape: [16384, 3072]
+149: model.layers.30.post_attention_layernorm.weight shape: [3072]
+150: model.layers.30.self_attn.o_proj.weight shape: [3072, 3072]
+151: model.layers.30.self_attn.qkv_proj.weight shape: [9216, 3072]
+152: model.layers.31.input_layernorm.weight shape: [3072]
+153: model.layers.31.mlp.down_proj.weight shape: [3072, 8192]
+154: model.layers.31.mlp.gate_up_proj.weight shape: [16384, 3072]
+155: model.layers.31.post_attention_layernorm.weight shape: [3072]
+156: model.layers.31.self_attn.o_proj.weight shape: [3072, 3072]
+157: model.layers.31.self_attn.qkv_proj.weight shape: [9216, 3072]
+158: model.layers.4.input_layernorm.weight shape: [3072]
+159: model.layers.4.mlp.down_proj.weight shape: [3072, 8192]
+160: model.layers.4.mlp.gate_up_proj.weight shape: [16384, 3072]
+161: model.layers.4.post_attention_layernorm.weight shape: [3072]
+162: model.layers.4.self_attn.o_proj.weight shape: [3072, 3072]
+163: model.layers.4.self_attn.qkv_proj.weight shape: [9216, 3072]
+164: model.layers.5.input_layernorm.weight shape: [3072]
+165: model.layers.5.mlp.down_proj.weight shape: [3072, 8192]
+166: model.layers.5.mlp.gate_up_proj.weight shape: [16384, 3072]
+167: model.layers.5.post_attention_layernorm.weight shape: [3072]
+168: model.layers.5.self_attn.o_proj.weight shape: [3072, 3072]
+169: model.layers.5.self_attn.qkv_proj.weight shape: [9216, 3072]
+170: model.layers.6.input_layernorm.weight shape: [3072]
+171: model.layers.6.mlp.down_proj.weight shape: [3072, 8192]
+172: model.layers.6.mlp.gate_up_proj.weight shape: [16384, 3072]
+173: model.layers.6.post_attention_layernorm.weight shape: [3072]
+174: model.layers.6.self_attn.o_proj.weight shape: [3072, 3072]
+175: model.layers.6.self_attn.qkv_proj.weight shape: [9216, 3072]
+176: model.layers.7.input_layernorm.weight shape: [3072]
+177: model.layers.7.mlp.down_proj.weight shape: [3072, 8192]
+178: model.layers.7.mlp.gate_up_proj.weight shape: [16384, 3072]
+179: model.layers.7.post_attention_layernorm.weight shape: [3072]
+180: model.layers.7.self_attn.o_proj.weight shape: [3072, 3072]
+181: model.layers.7.self_attn.qkv_proj.weight shape: [9216, 3072]
+182: model.layers.8.input_layernorm.weight shape: [3072]
+183: model.layers.8.mlp.down_proj.weight shape: [3072, 8192]
+184: model.layers.8.mlp.gate_up_proj.weight shape: [16384, 3072]
+185: model.layers.8.post_attention_layernorm.weight shape: [3072]
+186: model.layers.8.self_attn.o_proj.weight shape: [3072, 3072]
+187: model.layers.8.self_attn.qkv_proj.weight shape: [9216, 3072]
+188: model.layers.9.input_layernorm.weight shape: [3072]
+189: model.layers.9.mlp.down_proj.weight shape: [3072, 8192]
+190: model.layers.9.mlp.gate_up_proj.weight shape: [16384, 3072]
+191: model.layers.9.post_attention_layernorm.weight shape: [3072]
+192: model.layers.9.self_attn.o_proj.weight shape: [3072, 3072]
+193: model.layers.9.self_attn.qkv_proj.weight shape: [9216, 3072]
+194: model.norm.weight shape: [3072]
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt4QuantizeShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt4QuantizeShapeTest.approved.txt
new file mode 100644
index 0000000000..1855e6396e
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt4QuantizeShapeTest.approved.txt
@@ -0,0 +1,451 @@
+0: lm_head.weight shape: [32064, 3072]
+1: model.embed_tokens.weight shape: [32064, 3072]
+2: model.layers.0.input_layernorm.weight shape: [3072]
+3: model.layers.0.mlp.down_proj.4bit_weight shape: [12582912]
+4: model.layers.0.mlp.down_proj.scale shape: [3072]
+5: model.layers.0.mlp.down_proj.zeroPoint shape: [1536]
+6: model.layers.0.mlp.gate_up_proj.4bit_weight shape: [25165824]
+7: model.layers.0.mlp.gate_up_proj.scale shape: [16384]
+8: model.layers.0.mlp.gate_up_proj.zeroPoint shape: [8192]
+9: model.layers.0.post_attention_layernorm.weight shape: [3072]
+10: model.layers.0.self_attn.o_proj.4bit_weight shape: [4718592]
+11: model.layers.0.self_attn.o_proj.scale shape: [3072]
+12: model.layers.0.self_attn.o_proj.zeroPoint shape: [1536]
+13: model.layers.0.self_attn.qkv_proj.4bit_weight shape: [14155776]
+14: model.layers.0.self_attn.qkv_proj.scale shape: [9216]
+15: model.layers.0.self_attn.qkv_proj.zeroPoint shape: [4608]
+16: model.layers.1.input_layernorm.weight shape: [3072]
+17: model.layers.1.mlp.down_proj.4bit_weight shape: [12582912]
+18: model.layers.1.mlp.down_proj.scale shape: [3072]
+19: model.layers.1.mlp.down_proj.zeroPoint shape: [1536]
+20: model.layers.1.mlp.gate_up_proj.4bit_weight shape: [25165824]
+21: model.layers.1.mlp.gate_up_proj.scale shape: [16384]
+22: model.layers.1.mlp.gate_up_proj.zeroPoint shape: [8192]
+23: model.layers.1.post_attention_layernorm.weight shape: [3072]
+24: model.layers.1.self_attn.o_proj.4bit_weight shape: [4718592]
+25: model.layers.1.self_attn.o_proj.scale shape: [3072]
+26: model.layers.1.self_attn.o_proj.zeroPoint shape: [1536]
+27: model.layers.1.self_attn.qkv_proj.4bit_weight shape: [14155776]
+28: model.layers.1.self_attn.qkv_proj.scale shape: [9216]
+29: model.layers.1.self_attn.qkv_proj.zeroPoint shape: [4608]
+30: model.layers.10.input_layernorm.weight shape: [3072]
+31: model.layers.10.mlp.down_proj.4bit_weight shape: [12582912]
+32: model.layers.10.mlp.down_proj.scale shape: [3072]
+33: model.layers.10.mlp.down_proj.zeroPoint shape: [1536]
+34: model.layers.10.mlp.gate_up_proj.4bit_weight shape: [25165824]
+35: model.layers.10.mlp.gate_up_proj.scale shape: [16384]
+36: model.layers.10.mlp.gate_up_proj.zeroPoint shape: [8192]
+37: model.layers.10.post_attention_layernorm.weight shape: [3072]
+38: model.layers.10.self_attn.o_proj.4bit_weight shape: [4718592]
+39: model.layers.10.self_attn.o_proj.scale shape: [3072]
+40: model.layers.10.self_attn.o_proj.zeroPoint shape: [1536]
+41: model.layers.10.self_attn.qkv_proj.4bit_weight shape: [14155776]
+42: model.layers.10.self_attn.qkv_proj.scale shape: [9216]
+43: model.layers.10.self_attn.qkv_proj.zeroPoint shape: [4608]
+44: model.layers.11.input_layernorm.weight shape: [3072]
+45: model.layers.11.mlp.down_proj.4bit_weight shape: [12582912]
+46: model.layers.11.mlp.down_proj.scale shape: [3072]
+47: model.layers.11.mlp.down_proj.zeroPoint shape: [1536]
+48: model.layers.11.mlp.gate_up_proj.4bit_weight shape: [25165824]
+49: model.layers.11.mlp.gate_up_proj.scale shape: [16384]
+50: model.layers.11.mlp.gate_up_proj.zeroPoint shape: [8192]
+51: model.layers.11.post_attention_layernorm.weight shape: [3072]
+52: model.layers.11.self_attn.o_proj.4bit_weight shape: [4718592]
+53: model.layers.11.self_attn.o_proj.scale shape: [3072]
+54: model.layers.11.self_attn.o_proj.zeroPoint shape: [1536]
+55: model.layers.11.self_attn.qkv_proj.4bit_weight shape: [14155776]
+56: model.layers.11.self_attn.qkv_proj.scale shape: [9216]
+57: model.layers.11.self_attn.qkv_proj.zeroPoint shape: [4608]
+58: model.layers.12.input_layernorm.weight shape: [3072]
+59: model.layers.12.mlp.down_proj.4bit_weight shape: [12582912]
+60: model.layers.12.mlp.down_proj.scale shape: [3072]
+61: model.layers.12.mlp.down_proj.zeroPoint shape: [1536]
+62: model.layers.12.mlp.gate_up_proj.4bit_weight shape: [25165824]
+63: model.layers.12.mlp.gate_up_proj.scale shape: [16384]
+64: model.layers.12.mlp.gate_up_proj.zeroPoint shape: [8192]
+65: model.layers.12.post_attention_layernorm.weight shape: [3072]
+66: model.layers.12.self_attn.o_proj.4bit_weight shape: [4718592]
+67: model.layers.12.self_attn.o_proj.scale shape: [3072]
+68: model.layers.12.self_attn.o_proj.zeroPoint shape: [1536]
+69: model.layers.12.self_attn.qkv_proj.4bit_weight shape: [14155776]
+70: model.layers.12.self_attn.qkv_proj.scale shape: [9216]
+71: model.layers.12.self_attn.qkv_proj.zeroPoint shape: [4608]
+72: model.layers.13.input_layernorm.weight shape: [3072]
+73: model.layers.13.mlp.down_proj.4bit_weight shape: [12582912]
+74: model.layers.13.mlp.down_proj.scale shape: [3072]
+75: model.layers.13.mlp.down_proj.zeroPoint shape: [1536]
+76: model.layers.13.mlp.gate_up_proj.4bit_weight shape: [25165824]
+77: model.layers.13.mlp.gate_up_proj.scale shape: [16384]
+78: model.layers.13.mlp.gate_up_proj.zeroPoint shape: [8192]
+79: model.layers.13.post_attention_layernorm.weight shape: [3072]
+80: model.layers.13.self_attn.o_proj.4bit_weight shape: [4718592]
+81: model.layers.13.self_attn.o_proj.scale shape: [3072]
+82: model.layers.13.self_attn.o_proj.zeroPoint shape: [1536]
+83: model.layers.13.self_attn.qkv_proj.4bit_weight shape: [14155776]
+84: model.layers.13.self_attn.qkv_proj.scale shape: [9216]
+85: model.layers.13.self_attn.qkv_proj.zeroPoint shape: [4608]
+86: model.layers.14.input_layernorm.weight shape: [3072]
+87: model.layers.14.mlp.down_proj.4bit_weight shape: [12582912]
+88: model.layers.14.mlp.down_proj.scale shape: [3072]
+89: model.layers.14.mlp.down_proj.zeroPoint shape: [1536]
+90: model.layers.14.mlp.gate_up_proj.4bit_weight shape: [25165824]
+91: model.layers.14.mlp.gate_up_proj.scale shape: [16384]
+92: model.layers.14.mlp.gate_up_proj.zeroPoint shape: [8192]
+93: model.layers.14.post_attention_layernorm.weight shape: [3072]
+94: model.layers.14.self_attn.o_proj.4bit_weight shape: [4718592]
+95: model.layers.14.self_attn.o_proj.scale shape: [3072]
+96: model.layers.14.self_attn.o_proj.zeroPoint shape: [1536]
+97: model.layers.14.self_attn.qkv_proj.4bit_weight shape: [14155776]
+98: model.layers.14.self_attn.qkv_proj.scale shape: [9216]
+99: model.layers.14.self_attn.qkv_proj.zeroPoint shape: [4608]
+100: model.layers.15.input_layernorm.weight shape: [3072]
+101: model.layers.15.mlp.down_proj.4bit_weight shape: [12582912]
+102: model.layers.15.mlp.down_proj.scale shape: [3072]
+103: model.layers.15.mlp.down_proj.zeroPoint shape: [1536]
+104: model.layers.15.mlp.gate_up_proj.4bit_weight shape: [25165824]
+105: model.layers.15.mlp.gate_up_proj.scale shape: [16384]
+106: model.layers.15.mlp.gate_up_proj.zeroPoint shape: [8192]
+107: model.layers.15.post_attention_layernorm.weight shape: [3072]
+108: model.layers.15.self_attn.o_proj.4bit_weight shape: [4718592]
+109: model.layers.15.self_attn.o_proj.scale shape: [3072]
+110: model.layers.15.self_attn.o_proj.zeroPoint shape: [1536]
+111: model.layers.15.self_attn.qkv_proj.4bit_weight shape: [14155776]
+112: model.layers.15.self_attn.qkv_proj.scale shape: [9216]
+113: model.layers.15.self_attn.qkv_proj.zeroPoint shape: [4608]
+114: model.layers.16.input_layernorm.weight shape: [3072]
+115: model.layers.16.mlp.down_proj.4bit_weight shape: [12582912]
+116: model.layers.16.mlp.down_proj.scale shape: [3072]
+117: model.layers.16.mlp.down_proj.zeroPoint shape: [1536]
+118: model.layers.16.mlp.gate_up_proj.4bit_weight shape: [25165824]
+119: model.layers.16.mlp.gate_up_proj.scale shape: [16384]
+120: model.layers.16.mlp.gate_up_proj.zeroPoint shape: [8192]
+121: model.layers.16.post_attention_layernorm.weight shape: [3072]
+122: model.layers.16.self_attn.o_proj.4bit_weight shape: [4718592]
+123: model.layers.16.self_attn.o_proj.scale shape: [3072]
+124: model.layers.16.self_attn.o_proj.zeroPoint shape: [1536]
+125: model.layers.16.self_attn.qkv_proj.4bit_weight shape: [14155776]
+126: model.layers.16.self_attn.qkv_proj.scale shape: [9216]
+127: model.layers.16.self_attn.qkv_proj.zeroPoint shape: [4608]
+128: model.layers.17.input_layernorm.weight shape: [3072]
+129: model.layers.17.mlp.down_proj.4bit_weight shape: [12582912]
+130: model.layers.17.mlp.down_proj.scale shape: [3072]
+131: model.layers.17.mlp.down_proj.zeroPoint shape: [1536]
+132: model.layers.17.mlp.gate_up_proj.4bit_weight shape: [25165824]
+133: model.layers.17.mlp.gate_up_proj.scale shape: [16384]
+134: model.layers.17.mlp.gate_up_proj.zeroPoint shape: [8192]
+135: model.layers.17.post_attention_layernorm.weight shape: [3072]
+136: model.layers.17.self_attn.o_proj.4bit_weight shape: [4718592]
+137: model.layers.17.self_attn.o_proj.scale shape: [3072]
+138: model.layers.17.self_attn.o_proj.zeroPoint shape: [1536]
+139: model.layers.17.self_attn.qkv_proj.4bit_weight shape: [14155776]
+140: model.layers.17.self_attn.qkv_proj.scale shape: [9216]
+141: model.layers.17.self_attn.qkv_proj.zeroPoint shape: [4608]
+142: model.layers.18.input_layernorm.weight shape: [3072]
+143: model.layers.18.mlp.down_proj.4bit_weight shape: [12582912]
+144: model.layers.18.mlp.down_proj.scale shape: [3072]
+145: model.layers.18.mlp.down_proj.zeroPoint shape: [1536]
+146: model.layers.18.mlp.gate_up_proj.4bit_weight shape: [25165824]
+147: model.layers.18.mlp.gate_up_proj.scale shape: [16384]
+148: model.layers.18.mlp.gate_up_proj.zeroPoint shape: [8192]
+149: model.layers.18.post_attention_layernorm.weight shape: [3072]
+150: model.layers.18.self_attn.o_proj.4bit_weight shape: [4718592]
+151: model.layers.18.self_attn.o_proj.scale shape: [3072]
+152: model.layers.18.self_attn.o_proj.zeroPoint shape: [1536]
+153: model.layers.18.self_attn.qkv_proj.4bit_weight shape: [14155776]
+154: model.layers.18.self_attn.qkv_proj.scale shape: [9216]
+155: model.layers.18.self_attn.qkv_proj.zeroPoint shape: [4608]
+156: model.layers.19.input_layernorm.weight shape: [3072]
+157: model.layers.19.mlp.down_proj.4bit_weight shape: [12582912]
+158: model.layers.19.mlp.down_proj.scale shape: [3072]
+159: model.layers.19.mlp.down_proj.zeroPoint shape: [1536]
+160: model.layers.19.mlp.gate_up_proj.4bit_weight shape: [25165824]
+161: model.layers.19.mlp.gate_up_proj.scale shape: [16384]
+162: model.layers.19.mlp.gate_up_proj.zeroPoint shape: [8192]
+163: model.layers.19.post_attention_layernorm.weight shape: [3072]
+164: model.layers.19.self_attn.o_proj.4bit_weight shape: [4718592]
+165: model.layers.19.self_attn.o_proj.scale shape: [3072]
+166: model.layers.19.self_attn.o_proj.zeroPoint shape: [1536]
+167: model.layers.19.self_attn.qkv_proj.4bit_weight shape: [14155776]
+168: model.layers.19.self_attn.qkv_proj.scale shape: [9216]
+169: model.layers.19.self_attn.qkv_proj.zeroPoint shape: [4608]
+170: model.layers.2.input_layernorm.weight shape: [3072]
+171: model.layers.2.mlp.down_proj.4bit_weight shape: [12582912]
+172: model.layers.2.mlp.down_proj.scale shape: [3072]
+173: model.layers.2.mlp.down_proj.zeroPoint shape: [1536]
+174: model.layers.2.mlp.gate_up_proj.4bit_weight shape: [25165824]
+175: model.layers.2.mlp.gate_up_proj.scale shape: [16384]
+176: model.layers.2.mlp.gate_up_proj.zeroPoint shape: [8192]
+177: model.layers.2.post_attention_layernorm.weight shape: [3072]
+178: model.layers.2.self_attn.o_proj.4bit_weight shape: [4718592]
+179: model.layers.2.self_attn.o_proj.scale shape: [3072]
+180: model.layers.2.self_attn.o_proj.zeroPoint shape: [1536]
+181: model.layers.2.self_attn.qkv_proj.4bit_weight shape: [14155776]
+182: model.layers.2.self_attn.qkv_proj.scale shape: [9216]
+183: model.layers.2.self_attn.qkv_proj.zeroPoint shape: [4608]
+184: model.layers.20.input_layernorm.weight shape: [3072]
+185: model.layers.20.mlp.down_proj.4bit_weight shape: [12582912]
+186: model.layers.20.mlp.down_proj.scale shape: [3072]
+187: model.layers.20.mlp.down_proj.zeroPoint shape: [1536]
+188: model.layers.20.mlp.gate_up_proj.4bit_weight shape: [25165824]
+189: model.layers.20.mlp.gate_up_proj.scale shape: [16384]
+190: model.layers.20.mlp.gate_up_proj.zeroPoint shape: [8192]
+191: model.layers.20.post_attention_layernorm.weight shape: [3072]
+192: model.layers.20.self_attn.o_proj.4bit_weight shape: [4718592]
+193: model.layers.20.self_attn.o_proj.scale shape: [3072]
+194: model.layers.20.self_attn.o_proj.zeroPoint shape: [1536]
+195: model.layers.20.self_attn.qkv_proj.4bit_weight shape: [14155776]
+196: model.layers.20.self_attn.qkv_proj.scale shape: [9216]
+197: model.layers.20.self_attn.qkv_proj.zeroPoint shape: [4608]
+198: model.layers.21.input_layernorm.weight shape: [3072]
+199: model.layers.21.mlp.down_proj.4bit_weight shape: [12582912]
+200: model.layers.21.mlp.down_proj.scale shape: [3072]
+201: model.layers.21.mlp.down_proj.zeroPoint shape: [1536]
+202: model.layers.21.mlp.gate_up_proj.4bit_weight shape: [25165824]
+203: model.layers.21.mlp.gate_up_proj.scale shape: [16384]
+204: model.layers.21.mlp.gate_up_proj.zeroPoint shape: [8192]
+205: model.layers.21.post_attention_layernorm.weight shape: [3072]
+206: model.layers.21.self_attn.o_proj.4bit_weight shape: [4718592]
+207: model.layers.21.self_attn.o_proj.scale shape: [3072]
+208: model.layers.21.self_attn.o_proj.zeroPoint shape: [1536]
+209: model.layers.21.self_attn.qkv_proj.4bit_weight shape: [14155776]
+210: model.layers.21.self_attn.qkv_proj.scale shape: [9216]
+211: model.layers.21.self_attn.qkv_proj.zeroPoint shape: [4608]
+212: model.layers.22.input_layernorm.weight shape: [3072]
+213: model.layers.22.mlp.down_proj.4bit_weight shape: [12582912]
+214: model.layers.22.mlp.down_proj.scale shape: [3072]
+215: model.layers.22.mlp.down_proj.zeroPoint shape: [1536]
+216: model.layers.22.mlp.gate_up_proj.4bit_weight shape: [25165824]
+217: model.layers.22.mlp.gate_up_proj.scale shape: [16384]
+218: model.layers.22.mlp.gate_up_proj.zeroPoint shape: [8192]
+219: model.layers.22.post_attention_layernorm.weight shape: [3072]
+220: model.layers.22.self_attn.o_proj.4bit_weight shape: [4718592]
+221: model.layers.22.self_attn.o_proj.scale shape: [3072]
+222: model.layers.22.self_attn.o_proj.zeroPoint shape: [1536]
+223: model.layers.22.self_attn.qkv_proj.4bit_weight shape: [14155776]
+224: model.layers.22.self_attn.qkv_proj.scale shape: [9216]
+225: model.layers.22.self_attn.qkv_proj.zeroPoint shape: [4608]
+226: model.layers.23.input_layernorm.weight shape: [3072]
+227: model.layers.23.mlp.down_proj.4bit_weight shape: [12582912]
+228: model.layers.23.mlp.down_proj.scale shape: [3072]
+229: model.layers.23.mlp.down_proj.zeroPoint shape: [1536]
+230: model.layers.23.mlp.gate_up_proj.4bit_weight shape: [25165824]
+231: model.layers.23.mlp.gate_up_proj.scale shape: [16384]
+232: model.layers.23.mlp.gate_up_proj.zeroPoint shape: [8192]
+233: model.layers.23.post_attention_layernorm.weight shape: [3072]
+234: model.layers.23.self_attn.o_proj.4bit_weight shape: [4718592]
+235: model.layers.23.self_attn.o_proj.scale shape: [3072]
+236: model.layers.23.self_attn.o_proj.zeroPoint shape: [1536]
+237: model.layers.23.self_attn.qkv_proj.4bit_weight shape: [14155776]
+238: model.layers.23.self_attn.qkv_proj.scale shape: [9216]
+239: model.layers.23.self_attn.qkv_proj.zeroPoint shape: [4608]
+240: model.layers.24.input_layernorm.weight shape: [3072]
+241: model.layers.24.mlp.down_proj.4bit_weight shape: [12582912]
+242: model.layers.24.mlp.down_proj.scale shape: [3072]
+243: model.layers.24.mlp.down_proj.zeroPoint shape: [1536]
+244: model.layers.24.mlp.gate_up_proj.4bit_weight shape: [25165824]
+245: model.layers.24.mlp.gate_up_proj.scale shape: [16384]
+246: model.layers.24.mlp.gate_up_proj.zeroPoint shape: [8192]
+247: model.layers.24.post_attention_layernorm.weight shape: [3072]
+248: model.layers.24.self_attn.o_proj.4bit_weight shape: [4718592]
+249: model.layers.24.self_attn.o_proj.scale shape: [3072]
+250: model.layers.24.self_attn.o_proj.zeroPoint shape: [1536]
+251: model.layers.24.self_attn.qkv_proj.4bit_weight shape: [14155776]
+252: model.layers.24.self_attn.qkv_proj.scale shape: [9216]
+253: model.layers.24.self_attn.qkv_proj.zeroPoint shape: [4608]
+254: model.layers.25.input_layernorm.weight shape: [3072]
+255: model.layers.25.mlp.down_proj.4bit_weight shape: [12582912]
+256: model.layers.25.mlp.down_proj.scale shape: [3072]
+257: model.layers.25.mlp.down_proj.zeroPoint shape: [1536]
+258: model.layers.25.mlp.gate_up_proj.4bit_weight shape: [25165824]
+259: model.layers.25.mlp.gate_up_proj.scale shape: [16384]
+260: model.layers.25.mlp.gate_up_proj.zeroPoint shape: [8192]
+261: model.layers.25.post_attention_layernorm.weight shape: [3072]
+262: model.layers.25.self_attn.o_proj.4bit_weight shape: [4718592]
+263: model.layers.25.self_attn.o_proj.scale shape: [3072]
+264: model.layers.25.self_attn.o_proj.zeroPoint shape: [1536]
+265: model.layers.25.self_attn.qkv_proj.4bit_weight shape: [14155776]
+266: model.layers.25.self_attn.qkv_proj.scale shape: [9216]
+267: model.layers.25.self_attn.qkv_proj.zeroPoint shape: [4608]
+268: model.layers.26.input_layernorm.weight shape: [3072]
+269: model.layers.26.mlp.down_proj.4bit_weight shape: [12582912]
+270: model.layers.26.mlp.down_proj.scale shape: [3072]
+271: model.layers.26.mlp.down_proj.zeroPoint shape: [1536]
+272: model.layers.26.mlp.gate_up_proj.4bit_weight shape: [25165824]
+273: model.layers.26.mlp.gate_up_proj.scale shape: [16384]
+274: model.layers.26.mlp.gate_up_proj.zeroPoint shape: [8192]
+275: model.layers.26.post_attention_layernorm.weight shape: [3072]
+276: model.layers.26.self_attn.o_proj.4bit_weight shape: [4718592]
+277: model.layers.26.self_attn.o_proj.scale shape: [3072]
+278: model.layers.26.self_attn.o_proj.zeroPoint shape: [1536]
+279: model.layers.26.self_attn.qkv_proj.4bit_weight shape: [14155776]
+280: model.layers.26.self_attn.qkv_proj.scale shape: [9216]
+281: model.layers.26.self_attn.qkv_proj.zeroPoint shape: [4608]
+282: model.layers.27.input_layernorm.weight shape: [3072]
+283: model.layers.27.mlp.down_proj.4bit_weight shape: [12582912]
+284: model.layers.27.mlp.down_proj.scale shape: [3072]
+285: model.layers.27.mlp.down_proj.zeroPoint shape: [1536]
+286: model.layers.27.mlp.gate_up_proj.4bit_weight shape: [25165824]
+287: model.layers.27.mlp.gate_up_proj.scale shape: [16384]
+288: model.layers.27.mlp.gate_up_proj.zeroPoint shape: [8192]
+289: model.layers.27.post_attention_layernorm.weight shape: [3072]
+290: model.layers.27.self_attn.o_proj.4bit_weight shape: [4718592]
+291: model.layers.27.self_attn.o_proj.scale shape: [3072]
+292: model.layers.27.self_attn.o_proj.zeroPoint shape: [1536]
+293: model.layers.27.self_attn.qkv_proj.4bit_weight shape: [14155776]
+294: model.layers.27.self_attn.qkv_proj.scale shape: [9216]
+295: model.layers.27.self_attn.qkv_proj.zeroPoint shape: [4608]
+296: model.layers.28.input_layernorm.weight shape: [3072]
+297: model.layers.28.mlp.down_proj.4bit_weight shape: [12582912]
+298: model.layers.28.mlp.down_proj.scale shape: [3072]
+299: model.layers.28.mlp.down_proj.zeroPoint shape: [1536]
+300: model.layers.28.mlp.gate_up_proj.4bit_weight shape: [25165824]
+301: model.layers.28.mlp.gate_up_proj.scale shape: [16384]
+302: model.layers.28.mlp.gate_up_proj.zeroPoint shape: [8192]
+303: model.layers.28.post_attention_layernorm.weight shape: [3072]
+304: model.layers.28.self_attn.o_proj.4bit_weight shape: [4718592]
+305: model.layers.28.self_attn.o_proj.scale shape: [3072]
+306: model.layers.28.self_attn.o_proj.zeroPoint shape: [1536]
+307: model.layers.28.self_attn.qkv_proj.4bit_weight shape: [14155776]
+308: model.layers.28.self_attn.qkv_proj.scale shape: [9216]
+309: model.layers.28.self_attn.qkv_proj.zeroPoint shape: [4608]
+310: model.layers.29.input_layernorm.weight shape: [3072]
+311: model.layers.29.mlp.down_proj.4bit_weight shape: [12582912]
+312: model.layers.29.mlp.down_proj.scale shape: [3072]
+313: model.layers.29.mlp.down_proj.zeroPoint shape: [1536]
+314: model.layers.29.mlp.gate_up_proj.4bit_weight shape: [25165824]
+315: model.layers.29.mlp.gate_up_proj.scale shape: [16384]
+316: model.layers.29.mlp.gate_up_proj.zeroPoint shape: [8192]
+317: model.layers.29.post_attention_layernorm.weight shape: [3072]
+318: model.layers.29.self_attn.o_proj.4bit_weight shape: [4718592]
+319: model.layers.29.self_attn.o_proj.scale shape: [3072]
+320: model.layers.29.self_attn.o_proj.zeroPoint shape: [1536]
+321: model.layers.29.self_attn.qkv_proj.4bit_weight shape: [14155776]
+322: model.layers.29.self_attn.qkv_proj.scale shape: [9216]
+323: model.layers.29.self_attn.qkv_proj.zeroPoint shape: [4608]
+324: model.layers.3.input_layernorm.weight shape: [3072]
+325: model.layers.3.mlp.down_proj.4bit_weight shape: [12582912]
+326: model.layers.3.mlp.down_proj.scale shape: [3072]
+327: model.layers.3.mlp.down_proj.zeroPoint shape: [1536]
+328: model.layers.3.mlp.gate_up_proj.4bit_weight shape: [25165824]
+329: model.layers.3.mlp.gate_up_proj.scale shape: [16384]
+330: model.layers.3.mlp.gate_up_proj.zeroPoint shape: [8192]
+331: model.layers.3.post_attention_layernorm.weight shape: [3072]
+332: model.layers.3.self_attn.o_proj.4bit_weight shape: [4718592]
+333: model.layers.3.self_attn.o_proj.scale shape: [3072]
+334: model.layers.3.self_attn.o_proj.zeroPoint shape: [1536]
+335: model.layers.3.self_attn.qkv_proj.4bit_weight shape: [14155776]
+336: model.layers.3.self_attn.qkv_proj.scale shape: [9216]
+337: model.layers.3.self_attn.qkv_proj.zeroPoint shape: [4608]
+338: model.layers.30.input_layernorm.weight shape: [3072]
+339: model.layers.30.mlp.down_proj.4bit_weight shape: [12582912]
+340: model.layers.30.mlp.down_proj.scale shape: [3072]
+341: model.layers.30.mlp.down_proj.zeroPoint shape: [1536]
+342: model.layers.30.mlp.gate_up_proj.4bit_weight shape: [25165824]
+343: model.layers.30.mlp.gate_up_proj.scale shape: [16384]
+344: model.layers.30.mlp.gate_up_proj.zeroPoint shape: [8192]
+345: model.layers.30.post_attention_layernorm.weight shape: [3072]
+346: model.layers.30.self_attn.o_proj.4bit_weight shape: [4718592]
+347: model.layers.30.self_attn.o_proj.scale shape: [3072]
+348: model.layers.30.self_attn.o_proj.zeroPoint shape: [1536]
+349: model.layers.30.self_attn.qkv_proj.4bit_weight shape: [14155776]
+350: model.layers.30.self_attn.qkv_proj.scale shape: [9216]
+351: model.layers.30.self_attn.qkv_proj.zeroPoint shape: [4608]
+352: model.layers.31.input_layernorm.weight shape: [3072]
+353: model.layers.31.mlp.down_proj.4bit_weight shape: [12582912]
+354: model.layers.31.mlp.down_proj.scale shape: [3072]
+355: model.layers.31.mlp.down_proj.zeroPoint shape: [1536]
+356: model.layers.31.mlp.gate_up_proj.4bit_weight shape: [25165824]
+357: model.layers.31.mlp.gate_up_proj.scale shape: [16384]
+358: model.layers.31.mlp.gate_up_proj.zeroPoint shape: [8192]
+359: model.layers.31.post_attention_layernorm.weight shape: [3072]
+360: model.layers.31.self_attn.o_proj.4bit_weight shape: [4718592]
+361: model.layers.31.self_attn.o_proj.scale shape: [3072]
+362: model.layers.31.self_attn.o_proj.zeroPoint shape: [1536]
+363: model.layers.31.self_attn.qkv_proj.4bit_weight shape: [14155776]
+364: model.layers.31.self_attn.qkv_proj.scale shape: [9216]
+365: model.layers.31.self_attn.qkv_proj.zeroPoint shape: [4608]
+366: model.layers.4.input_layernorm.weight shape: [3072]
+367: model.layers.4.mlp.down_proj.4bit_weight shape: [12582912]
+368: model.layers.4.mlp.down_proj.scale shape: [3072]
+369: model.layers.4.mlp.down_proj.zeroPoint shape: [1536]
+370: model.layers.4.mlp.gate_up_proj.4bit_weight shape: [25165824]
+371: model.layers.4.mlp.gate_up_proj.scale shape: [16384]
+372: model.layers.4.mlp.gate_up_proj.zeroPoint shape: [8192]
+373: model.layers.4.post_attention_layernorm.weight shape: [3072]
+374: model.layers.4.self_attn.o_proj.4bit_weight shape: [4718592]
+375: model.layers.4.self_attn.o_proj.scale shape: [3072]
+376: model.layers.4.self_attn.o_proj.zeroPoint shape: [1536]
+377: model.layers.4.self_attn.qkv_proj.4bit_weight shape: [14155776]
+378: model.layers.4.self_attn.qkv_proj.scale shape: [9216]
+379: model.layers.4.self_attn.qkv_proj.zeroPoint shape: [4608]
+380: model.layers.5.input_layernorm.weight shape: [3072]
+381: model.layers.5.mlp.down_proj.4bit_weight shape: [12582912]
+382: model.layers.5.mlp.down_proj.scale shape: [3072]
+383: model.layers.5.mlp.down_proj.zeroPoint shape: [1536]
+384: model.layers.5.mlp.gate_up_proj.4bit_weight shape: [25165824]
+385: model.layers.5.mlp.gate_up_proj.scale shape: [16384]
+386: model.layers.5.mlp.gate_up_proj.zeroPoint shape: [8192]
+387: model.layers.5.post_attention_layernorm.weight shape: [3072]
+388: model.layers.5.self_attn.o_proj.4bit_weight shape: [4718592]
+389: model.layers.5.self_attn.o_proj.scale shape: [3072]
+390: model.layers.5.self_attn.o_proj.zeroPoint shape: [1536]
+391: model.layers.5.self_attn.qkv_proj.4bit_weight shape: [14155776]
+392: model.layers.5.self_attn.qkv_proj.scale shape: [9216]
+393: model.layers.5.self_attn.qkv_proj.zeroPoint shape: [4608]
+394: model.layers.6.input_layernorm.weight shape: [3072]
+395: model.layers.6.mlp.down_proj.4bit_weight shape: [12582912]
+396: model.layers.6.mlp.down_proj.scale shape: [3072]
+397: model.layers.6.mlp.down_proj.zeroPoint shape: [1536]
+398: model.layers.6.mlp.gate_up_proj.4bit_weight shape: [25165824]
+399: model.layers.6.mlp.gate_up_proj.scale shape: [16384]
+400: model.layers.6.mlp.gate_up_proj.zeroPoint shape: [8192]
+401: model.layers.6.post_attention_layernorm.weight shape: [3072]
+402: model.layers.6.self_attn.o_proj.4bit_weight shape: [4718592]
+403: model.layers.6.self_attn.o_proj.scale shape: [3072]
+404: model.layers.6.self_attn.o_proj.zeroPoint shape: [1536]
+405: model.layers.6.self_attn.qkv_proj.4bit_weight shape: [14155776]
+406: model.layers.6.self_attn.qkv_proj.scale shape: [9216]
+407: model.layers.6.self_attn.qkv_proj.zeroPoint shape: [4608]
+408: model.layers.7.input_layernorm.weight shape: [3072]
+409: model.layers.7.mlp.down_proj.4bit_weight shape: [12582912]
+410: model.layers.7.mlp.down_proj.scale shape: [3072]
+411: model.layers.7.mlp.down_proj.zeroPoint shape: [1536]
+412: model.layers.7.mlp.gate_up_proj.4bit_weight shape: [25165824]
+413: model.layers.7.mlp.gate_up_proj.scale shape: [16384]
+414: model.layers.7.mlp.gate_up_proj.zeroPoint shape: [8192]
+415: model.layers.7.post_attention_layernorm.weight shape: [3072]
+416: model.layers.7.self_attn.o_proj.4bit_weight shape: [4718592]
+417: model.layers.7.self_attn.o_proj.scale shape: [3072]
+418: model.layers.7.self_attn.o_proj.zeroPoint shape: [1536]
+419: model.layers.7.self_attn.qkv_proj.4bit_weight shape: [14155776]
+420: model.layers.7.self_attn.qkv_proj.scale shape: [9216]
+421: model.layers.7.self_attn.qkv_proj.zeroPoint shape: [4608]
+422: model.layers.8.input_layernorm.weight shape: [3072]
+423: model.layers.8.mlp.down_proj.4bit_weight shape: [12582912]
+424: model.layers.8.mlp.down_proj.scale shape: [3072]
+425: model.layers.8.mlp.down_proj.zeroPoint shape: [1536]
+426: model.layers.8.mlp.gate_up_proj.4bit_weight shape: [25165824]
+427: model.layers.8.mlp.gate_up_proj.scale shape: [16384]
+428: model.layers.8.mlp.gate_up_proj.zeroPoint shape: [8192]
+429: model.layers.8.post_attention_layernorm.weight shape: [3072]
+430: model.layers.8.self_attn.o_proj.4bit_weight shape: [4718592]
+431: model.layers.8.self_attn.o_proj.scale shape: [3072]
+432: model.layers.8.self_attn.o_proj.zeroPoint shape: [1536]
+433: model.layers.8.self_attn.qkv_proj.4bit_weight shape: [14155776]
+434: model.layers.8.self_attn.qkv_proj.scale shape: [9216]
+435: model.layers.8.self_attn.qkv_proj.zeroPoint shape: [4608]
+436: model.layers.9.input_layernorm.weight shape: [3072]
+437: model.layers.9.mlp.down_proj.4bit_weight shape: [12582912]
+438: model.layers.9.mlp.down_proj.scale shape: [3072]
+439: model.layers.9.mlp.down_proj.zeroPoint shape: [1536]
+440: model.layers.9.mlp.gate_up_proj.4bit_weight shape: [25165824]
+441: model.layers.9.mlp.gate_up_proj.scale shape: [16384]
+442: model.layers.9.mlp.gate_up_proj.zeroPoint shape: [8192]
+443: model.layers.9.post_attention_layernorm.weight shape: [3072]
+444: model.layers.9.self_attn.o_proj.4bit_weight shape: [4718592]
+445: model.layers.9.self_attn.o_proj.scale shape: [3072]
+446: model.layers.9.self_attn.o_proj.zeroPoint shape: [1536]
+447: model.layers.9.self_attn.qkv_proj.4bit_weight shape: [14155776]
+448: model.layers.9.self_attn.qkv_proj.scale shape: [9216]
+449: model.layers.9.self_attn.qkv_proj.zeroPoint shape: [4608]
+450: model.norm.weight shape: [3072]
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt8QuantizeShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt8QuantizeShapeTest.approved.txt
new file mode 100644
index 0000000000..d3ab1d8010
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KInt8QuantizeShapeTest.approved.txt
@@ -0,0 +1,451 @@
+0: lm_head.weight shape: [32064, 3072]
+1: model.embed_tokens.weight shape: [32064, 3072]
+2: model.layers.0.input_layernorm.weight shape: [3072]
+3: model.layers.0.mlp.down_proj.8bit_weight shape: [3072, 8192]
+4: model.layers.0.mlp.down_proj.scale shape: [3072]
+5: model.layers.0.mlp.down_proj.zeroPoint shape: [3072]
+6: model.layers.0.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+7: model.layers.0.mlp.gate_up_proj.scale shape: [16384]
+8: model.layers.0.mlp.gate_up_proj.zeroPoint shape: [16384]
+9: model.layers.0.post_attention_layernorm.weight shape: [3072]
+10: model.layers.0.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+11: model.layers.0.self_attn.o_proj.scale shape: [3072]
+12: model.layers.0.self_attn.o_proj.zeroPoint shape: [3072]
+13: model.layers.0.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+14: model.layers.0.self_attn.qkv_proj.scale shape: [9216]
+15: model.layers.0.self_attn.qkv_proj.zeroPoint shape: [9216]
+16: model.layers.1.input_layernorm.weight shape: [3072]
+17: model.layers.1.mlp.down_proj.8bit_weight shape: [3072, 8192]
+18: model.layers.1.mlp.down_proj.scale shape: [3072]
+19: model.layers.1.mlp.down_proj.zeroPoint shape: [3072]
+20: model.layers.1.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+21: model.layers.1.mlp.gate_up_proj.scale shape: [16384]
+22: model.layers.1.mlp.gate_up_proj.zeroPoint shape: [16384]
+23: model.layers.1.post_attention_layernorm.weight shape: [3072]
+24: model.layers.1.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+25: model.layers.1.self_attn.o_proj.scale shape: [3072]
+26: model.layers.1.self_attn.o_proj.zeroPoint shape: [3072]
+27: model.layers.1.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+28: model.layers.1.self_attn.qkv_proj.scale shape: [9216]
+29: model.layers.1.self_attn.qkv_proj.zeroPoint shape: [9216]
+30: model.layers.10.input_layernorm.weight shape: [3072]
+31: model.layers.10.mlp.down_proj.8bit_weight shape: [3072, 8192]
+32: model.layers.10.mlp.down_proj.scale shape: [3072]
+33: model.layers.10.mlp.down_proj.zeroPoint shape: [3072]
+34: model.layers.10.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+35: model.layers.10.mlp.gate_up_proj.scale shape: [16384]
+36: model.layers.10.mlp.gate_up_proj.zeroPoint shape: [16384]
+37: model.layers.10.post_attention_layernorm.weight shape: [3072]
+38: model.layers.10.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+39: model.layers.10.self_attn.o_proj.scale shape: [3072]
+40: model.layers.10.self_attn.o_proj.zeroPoint shape: [3072]
+41: model.layers.10.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+42: model.layers.10.self_attn.qkv_proj.scale shape: [9216]
+43: model.layers.10.self_attn.qkv_proj.zeroPoint shape: [9216]
+44: model.layers.11.input_layernorm.weight shape: [3072]
+45: model.layers.11.mlp.down_proj.8bit_weight shape: [3072, 8192]
+46: model.layers.11.mlp.down_proj.scale shape: [3072]
+47: model.layers.11.mlp.down_proj.zeroPoint shape: [3072]
+48: model.layers.11.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+49: model.layers.11.mlp.gate_up_proj.scale shape: [16384]
+50: model.layers.11.mlp.gate_up_proj.zeroPoint shape: [16384]
+51: model.layers.11.post_attention_layernorm.weight shape: [3072]
+52: model.layers.11.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+53: model.layers.11.self_attn.o_proj.scale shape: [3072]
+54: model.layers.11.self_attn.o_proj.zeroPoint shape: [3072]
+55: model.layers.11.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+56: model.layers.11.self_attn.qkv_proj.scale shape: [9216]
+57: model.layers.11.self_attn.qkv_proj.zeroPoint shape: [9216]
+58: model.layers.12.input_layernorm.weight shape: [3072]
+59: model.layers.12.mlp.down_proj.8bit_weight shape: [3072, 8192]
+60: model.layers.12.mlp.down_proj.scale shape: [3072]
+61: model.layers.12.mlp.down_proj.zeroPoint shape: [3072]
+62: model.layers.12.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+63: model.layers.12.mlp.gate_up_proj.scale shape: [16384]
+64: model.layers.12.mlp.gate_up_proj.zeroPoint shape: [16384]
+65: model.layers.12.post_attention_layernorm.weight shape: [3072]
+66: model.layers.12.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+67: model.layers.12.self_attn.o_proj.scale shape: [3072]
+68: model.layers.12.self_attn.o_proj.zeroPoint shape: [3072]
+69: model.layers.12.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+70: model.layers.12.self_attn.qkv_proj.scale shape: [9216]
+71: model.layers.12.self_attn.qkv_proj.zeroPoint shape: [9216]
+72: model.layers.13.input_layernorm.weight shape: [3072]
+73: model.layers.13.mlp.down_proj.8bit_weight shape: [3072, 8192]
+74: model.layers.13.mlp.down_proj.scale shape: [3072]
+75: model.layers.13.mlp.down_proj.zeroPoint shape: [3072]
+76: model.layers.13.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+77: model.layers.13.mlp.gate_up_proj.scale shape: [16384]
+78: model.layers.13.mlp.gate_up_proj.zeroPoint shape: [16384]
+79: model.layers.13.post_attention_layernorm.weight shape: [3072]
+80: model.layers.13.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+81: model.layers.13.self_attn.o_proj.scale shape: [3072]
+82: model.layers.13.self_attn.o_proj.zeroPoint shape: [3072]
+83: model.layers.13.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+84: model.layers.13.self_attn.qkv_proj.scale shape: [9216]
+85: model.layers.13.self_attn.qkv_proj.zeroPoint shape: [9216]
+86: model.layers.14.input_layernorm.weight shape: [3072]
+87: model.layers.14.mlp.down_proj.8bit_weight shape: [3072, 8192]
+88: model.layers.14.mlp.down_proj.scale shape: [3072]
+89: model.layers.14.mlp.down_proj.zeroPoint shape: [3072]
+90: model.layers.14.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+91: model.layers.14.mlp.gate_up_proj.scale shape: [16384]
+92: model.layers.14.mlp.gate_up_proj.zeroPoint shape: [16384]
+93: model.layers.14.post_attention_layernorm.weight shape: [3072]
+94: model.layers.14.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+95: model.layers.14.self_attn.o_proj.scale shape: [3072]
+96: model.layers.14.self_attn.o_proj.zeroPoint shape: [3072]
+97: model.layers.14.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+98: model.layers.14.self_attn.qkv_proj.scale shape: [9216]
+99: model.layers.14.self_attn.qkv_proj.zeroPoint shape: [9216]
+100: model.layers.15.input_layernorm.weight shape: [3072]
+101: model.layers.15.mlp.down_proj.8bit_weight shape: [3072, 8192]
+102: model.layers.15.mlp.down_proj.scale shape: [3072]
+103: model.layers.15.mlp.down_proj.zeroPoint shape: [3072]
+104: model.layers.15.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+105: model.layers.15.mlp.gate_up_proj.scale shape: [16384]
+106: model.layers.15.mlp.gate_up_proj.zeroPoint shape: [16384]
+107: model.layers.15.post_attention_layernorm.weight shape: [3072]
+108: model.layers.15.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+109: model.layers.15.self_attn.o_proj.scale shape: [3072]
+110: model.layers.15.self_attn.o_proj.zeroPoint shape: [3072]
+111: model.layers.15.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+112: model.layers.15.self_attn.qkv_proj.scale shape: [9216]
+113: model.layers.15.self_attn.qkv_proj.zeroPoint shape: [9216]
+114: model.layers.16.input_layernorm.weight shape: [3072]
+115: model.layers.16.mlp.down_proj.8bit_weight shape: [3072, 8192]
+116: model.layers.16.mlp.down_proj.scale shape: [3072]
+117: model.layers.16.mlp.down_proj.zeroPoint shape: [3072]
+118: model.layers.16.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+119: model.layers.16.mlp.gate_up_proj.scale shape: [16384]
+120: model.layers.16.mlp.gate_up_proj.zeroPoint shape: [16384]
+121: model.layers.16.post_attention_layernorm.weight shape: [3072]
+122: model.layers.16.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+123: model.layers.16.self_attn.o_proj.scale shape: [3072]
+124: model.layers.16.self_attn.o_proj.zeroPoint shape: [3072]
+125: model.layers.16.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+126: model.layers.16.self_attn.qkv_proj.scale shape: [9216]
+127: model.layers.16.self_attn.qkv_proj.zeroPoint shape: [9216]
+128: model.layers.17.input_layernorm.weight shape: [3072]
+129: model.layers.17.mlp.down_proj.8bit_weight shape: [3072, 8192]
+130: model.layers.17.mlp.down_proj.scale shape: [3072]
+131: model.layers.17.mlp.down_proj.zeroPoint shape: [3072]
+132: model.layers.17.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+133: model.layers.17.mlp.gate_up_proj.scale shape: [16384]
+134: model.layers.17.mlp.gate_up_proj.zeroPoint shape: [16384]
+135: model.layers.17.post_attention_layernorm.weight shape: [3072]
+136: model.layers.17.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+137: model.layers.17.self_attn.o_proj.scale shape: [3072]
+138: model.layers.17.self_attn.o_proj.zeroPoint shape: [3072]
+139: model.layers.17.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+140: model.layers.17.self_attn.qkv_proj.scale shape: [9216]
+141: model.layers.17.self_attn.qkv_proj.zeroPoint shape: [9216]
+142: model.layers.18.input_layernorm.weight shape: [3072]
+143: model.layers.18.mlp.down_proj.8bit_weight shape: [3072, 8192]
+144: model.layers.18.mlp.down_proj.scale shape: [3072]
+145: model.layers.18.mlp.down_proj.zeroPoint shape: [3072]
+146: model.layers.18.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+147: model.layers.18.mlp.gate_up_proj.scale shape: [16384]
+148: model.layers.18.mlp.gate_up_proj.zeroPoint shape: [16384]
+149: model.layers.18.post_attention_layernorm.weight shape: [3072]
+150: model.layers.18.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+151: model.layers.18.self_attn.o_proj.scale shape: [3072]
+152: model.layers.18.self_attn.o_proj.zeroPoint shape: [3072]
+153: model.layers.18.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+154: model.layers.18.self_attn.qkv_proj.scale shape: [9216]
+155: model.layers.18.self_attn.qkv_proj.zeroPoint shape: [9216]
+156: model.layers.19.input_layernorm.weight shape: [3072]
+157: model.layers.19.mlp.down_proj.8bit_weight shape: [3072, 8192]
+158: model.layers.19.mlp.down_proj.scale shape: [3072]
+159: model.layers.19.mlp.down_proj.zeroPoint shape: [3072]
+160: model.layers.19.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+161: model.layers.19.mlp.gate_up_proj.scale shape: [16384]
+162: model.layers.19.mlp.gate_up_proj.zeroPoint shape: [16384]
+163: model.layers.19.post_attention_layernorm.weight shape: [3072]
+164: model.layers.19.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+165: model.layers.19.self_attn.o_proj.scale shape: [3072]
+166: model.layers.19.self_attn.o_proj.zeroPoint shape: [3072]
+167: model.layers.19.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+168: model.layers.19.self_attn.qkv_proj.scale shape: [9216]
+169: model.layers.19.self_attn.qkv_proj.zeroPoint shape: [9216]
+170: model.layers.2.input_layernorm.weight shape: [3072]
+171: model.layers.2.mlp.down_proj.8bit_weight shape: [3072, 8192]
+172: model.layers.2.mlp.down_proj.scale shape: [3072]
+173: model.layers.2.mlp.down_proj.zeroPoint shape: [3072]
+174: model.layers.2.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+175: model.layers.2.mlp.gate_up_proj.scale shape: [16384]
+176: model.layers.2.mlp.gate_up_proj.zeroPoint shape: [16384]
+177: model.layers.2.post_attention_layernorm.weight shape: [3072]
+178: model.layers.2.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+179: model.layers.2.self_attn.o_proj.scale shape: [3072]
+180: model.layers.2.self_attn.o_proj.zeroPoint shape: [3072]
+181: model.layers.2.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+182: model.layers.2.self_attn.qkv_proj.scale shape: [9216]
+183: model.layers.2.self_attn.qkv_proj.zeroPoint shape: [9216]
+184: model.layers.20.input_layernorm.weight shape: [3072]
+185: model.layers.20.mlp.down_proj.8bit_weight shape: [3072, 8192]
+186: model.layers.20.mlp.down_proj.scale shape: [3072]
+187: model.layers.20.mlp.down_proj.zeroPoint shape: [3072]
+188: model.layers.20.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+189: model.layers.20.mlp.gate_up_proj.scale shape: [16384]
+190: model.layers.20.mlp.gate_up_proj.zeroPoint shape: [16384]
+191: model.layers.20.post_attention_layernorm.weight shape: [3072]
+192: model.layers.20.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+193: model.layers.20.self_attn.o_proj.scale shape: [3072]
+194: model.layers.20.self_attn.o_proj.zeroPoint shape: [3072]
+195: model.layers.20.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+196: model.layers.20.self_attn.qkv_proj.scale shape: [9216]
+197: model.layers.20.self_attn.qkv_proj.zeroPoint shape: [9216]
+198: model.layers.21.input_layernorm.weight shape: [3072]
+199: model.layers.21.mlp.down_proj.8bit_weight shape: [3072, 8192]
+200: model.layers.21.mlp.down_proj.scale shape: [3072]
+201: model.layers.21.mlp.down_proj.zeroPoint shape: [3072]
+202: model.layers.21.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+203: model.layers.21.mlp.gate_up_proj.scale shape: [16384]
+204: model.layers.21.mlp.gate_up_proj.zeroPoint shape: [16384]
+205: model.layers.21.post_attention_layernorm.weight shape: [3072]
+206: model.layers.21.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+207: model.layers.21.self_attn.o_proj.scale shape: [3072]
+208: model.layers.21.self_attn.o_proj.zeroPoint shape: [3072]
+209: model.layers.21.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+210: model.layers.21.self_attn.qkv_proj.scale shape: [9216]
+211: model.layers.21.self_attn.qkv_proj.zeroPoint shape: [9216]
+212: model.layers.22.input_layernorm.weight shape: [3072]
+213: model.layers.22.mlp.down_proj.8bit_weight shape: [3072, 8192]
+214: model.layers.22.mlp.down_proj.scale shape: [3072]
+215: model.layers.22.mlp.down_proj.zeroPoint shape: [3072]
+216: model.layers.22.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+217: model.layers.22.mlp.gate_up_proj.scale shape: [16384]
+218: model.layers.22.mlp.gate_up_proj.zeroPoint shape: [16384]
+219: model.layers.22.post_attention_layernorm.weight shape: [3072]
+220: model.layers.22.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+221: model.layers.22.self_attn.o_proj.scale shape: [3072]
+222: model.layers.22.self_attn.o_proj.zeroPoint shape: [3072]
+223: model.layers.22.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+224: model.layers.22.self_attn.qkv_proj.scale shape: [9216]
+225: model.layers.22.self_attn.qkv_proj.zeroPoint shape: [9216]
+226: model.layers.23.input_layernorm.weight shape: [3072]
+227: model.layers.23.mlp.down_proj.8bit_weight shape: [3072, 8192]
+228: model.layers.23.mlp.down_proj.scale shape: [3072]
+229: model.layers.23.mlp.down_proj.zeroPoint shape: [3072]
+230: model.layers.23.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+231: model.layers.23.mlp.gate_up_proj.scale shape: [16384]
+232: model.layers.23.mlp.gate_up_proj.zeroPoint shape: [16384]
+233: model.layers.23.post_attention_layernorm.weight shape: [3072]
+234: model.layers.23.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+235: model.layers.23.self_attn.o_proj.scale shape: [3072]
+236: model.layers.23.self_attn.o_proj.zeroPoint shape: [3072]
+237: model.layers.23.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+238: model.layers.23.self_attn.qkv_proj.scale shape: [9216]
+239: model.layers.23.self_attn.qkv_proj.zeroPoint shape: [9216]
+240: model.layers.24.input_layernorm.weight shape: [3072]
+241: model.layers.24.mlp.down_proj.8bit_weight shape: [3072, 8192]
+242: model.layers.24.mlp.down_proj.scale shape: [3072]
+243: model.layers.24.mlp.down_proj.zeroPoint shape: [3072]
+244: model.layers.24.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+245: model.layers.24.mlp.gate_up_proj.scale shape: [16384]
+246: model.layers.24.mlp.gate_up_proj.zeroPoint shape: [16384]
+247: model.layers.24.post_attention_layernorm.weight shape: [3072]
+248: model.layers.24.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+249: model.layers.24.self_attn.o_proj.scale shape: [3072]
+250: model.layers.24.self_attn.o_proj.zeroPoint shape: [3072]
+251: model.layers.24.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+252: model.layers.24.self_attn.qkv_proj.scale shape: [9216]
+253: model.layers.24.self_attn.qkv_proj.zeroPoint shape: [9216]
+254: model.layers.25.input_layernorm.weight shape: [3072]
+255: model.layers.25.mlp.down_proj.8bit_weight shape: [3072, 8192]
+256: model.layers.25.mlp.down_proj.scale shape: [3072]
+257: model.layers.25.mlp.down_proj.zeroPoint shape: [3072]
+258: model.layers.25.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+259: model.layers.25.mlp.gate_up_proj.scale shape: [16384]
+260: model.layers.25.mlp.gate_up_proj.zeroPoint shape: [16384]
+261: model.layers.25.post_attention_layernorm.weight shape: [3072]
+262: model.layers.25.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+263: model.layers.25.self_attn.o_proj.scale shape: [3072]
+264: model.layers.25.self_attn.o_proj.zeroPoint shape: [3072]
+265: model.layers.25.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+266: model.layers.25.self_attn.qkv_proj.scale shape: [9216]
+267: model.layers.25.self_attn.qkv_proj.zeroPoint shape: [9216]
+268: model.layers.26.input_layernorm.weight shape: [3072]
+269: model.layers.26.mlp.down_proj.8bit_weight shape: [3072, 8192]
+270: model.layers.26.mlp.down_proj.scale shape: [3072]
+271: model.layers.26.mlp.down_proj.zeroPoint shape: [3072]
+272: model.layers.26.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+273: model.layers.26.mlp.gate_up_proj.scale shape: [16384]
+274: model.layers.26.mlp.gate_up_proj.zeroPoint shape: [16384]
+275: model.layers.26.post_attention_layernorm.weight shape: [3072]
+276: model.layers.26.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+277: model.layers.26.self_attn.o_proj.scale shape: [3072]
+278: model.layers.26.self_attn.o_proj.zeroPoint shape: [3072]
+279: model.layers.26.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+280: model.layers.26.self_attn.qkv_proj.scale shape: [9216]
+281: model.layers.26.self_attn.qkv_proj.zeroPoint shape: [9216]
+282: model.layers.27.input_layernorm.weight shape: [3072]
+283: model.layers.27.mlp.down_proj.8bit_weight shape: [3072, 8192]
+284: model.layers.27.mlp.down_proj.scale shape: [3072]
+285: model.layers.27.mlp.down_proj.zeroPoint shape: [3072]
+286: model.layers.27.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+287: model.layers.27.mlp.gate_up_proj.scale shape: [16384]
+288: model.layers.27.mlp.gate_up_proj.zeroPoint shape: [16384]
+289: model.layers.27.post_attention_layernorm.weight shape: [3072]
+290: model.layers.27.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+291: model.layers.27.self_attn.o_proj.scale shape: [3072]
+292: model.layers.27.self_attn.o_proj.zeroPoint shape: [3072]
+293: model.layers.27.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+294: model.layers.27.self_attn.qkv_proj.scale shape: [9216]
+295: model.layers.27.self_attn.qkv_proj.zeroPoint shape: [9216]
+296: model.layers.28.input_layernorm.weight shape: [3072]
+297: model.layers.28.mlp.down_proj.8bit_weight shape: [3072, 8192]
+298: model.layers.28.mlp.down_proj.scale shape: [3072]
+299: model.layers.28.mlp.down_proj.zeroPoint shape: [3072]
+300: model.layers.28.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+301: model.layers.28.mlp.gate_up_proj.scale shape: [16384]
+302: model.layers.28.mlp.gate_up_proj.zeroPoint shape: [16384]
+303: model.layers.28.post_attention_layernorm.weight shape: [3072]
+304: model.layers.28.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+305: model.layers.28.self_attn.o_proj.scale shape: [3072]
+306: model.layers.28.self_attn.o_proj.zeroPoint shape: [3072]
+307: model.layers.28.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+308: model.layers.28.self_attn.qkv_proj.scale shape: [9216]
+309: model.layers.28.self_attn.qkv_proj.zeroPoint shape: [9216]
+310: model.layers.29.input_layernorm.weight shape: [3072]
+311: model.layers.29.mlp.down_proj.8bit_weight shape: [3072, 8192]
+312: model.layers.29.mlp.down_proj.scale shape: [3072]
+313: model.layers.29.mlp.down_proj.zeroPoint shape: [3072]
+314: model.layers.29.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+315: model.layers.29.mlp.gate_up_proj.scale shape: [16384]
+316: model.layers.29.mlp.gate_up_proj.zeroPoint shape: [16384]
+317: model.layers.29.post_attention_layernorm.weight shape: [3072]
+318: model.layers.29.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+319: model.layers.29.self_attn.o_proj.scale shape: [3072]
+320: model.layers.29.self_attn.o_proj.zeroPoint shape: [3072]
+321: model.layers.29.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+322: model.layers.29.self_attn.qkv_proj.scale shape: [9216]
+323: model.layers.29.self_attn.qkv_proj.zeroPoint shape: [9216]
+324: model.layers.3.input_layernorm.weight shape: [3072]
+325: model.layers.3.mlp.down_proj.8bit_weight shape: [3072, 8192]
+326: model.layers.3.mlp.down_proj.scale shape: [3072]
+327: model.layers.3.mlp.down_proj.zeroPoint shape: [3072]
+328: model.layers.3.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+329: model.layers.3.mlp.gate_up_proj.scale shape: [16384]
+330: model.layers.3.mlp.gate_up_proj.zeroPoint shape: [16384]
+331: model.layers.3.post_attention_layernorm.weight shape: [3072]
+332: model.layers.3.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+333: model.layers.3.self_attn.o_proj.scale shape: [3072]
+334: model.layers.3.self_attn.o_proj.zeroPoint shape: [3072]
+335: model.layers.3.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+336: model.layers.3.self_attn.qkv_proj.scale shape: [9216]
+337: model.layers.3.self_attn.qkv_proj.zeroPoint shape: [9216]
+338: model.layers.30.input_layernorm.weight shape: [3072]
+339: model.layers.30.mlp.down_proj.8bit_weight shape: [3072, 8192]
+340: model.layers.30.mlp.down_proj.scale shape: [3072]
+341: model.layers.30.mlp.down_proj.zeroPoint shape: [3072]
+342: model.layers.30.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+343: model.layers.30.mlp.gate_up_proj.scale shape: [16384]
+344: model.layers.30.mlp.gate_up_proj.zeroPoint shape: [16384]
+345: model.layers.30.post_attention_layernorm.weight shape: [3072]
+346: model.layers.30.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+347: model.layers.30.self_attn.o_proj.scale shape: [3072]
+348: model.layers.30.self_attn.o_proj.zeroPoint shape: [3072]
+349: model.layers.30.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+350: model.layers.30.self_attn.qkv_proj.scale shape: [9216]
+351: model.layers.30.self_attn.qkv_proj.zeroPoint shape: [9216]
+352: model.layers.31.input_layernorm.weight shape: [3072]
+353: model.layers.31.mlp.down_proj.8bit_weight shape: [3072, 8192]
+354: model.layers.31.mlp.down_proj.scale shape: [3072]
+355: model.layers.31.mlp.down_proj.zeroPoint shape: [3072]
+356: model.layers.31.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+357: model.layers.31.mlp.gate_up_proj.scale shape: [16384]
+358: model.layers.31.mlp.gate_up_proj.zeroPoint shape: [16384]
+359: model.layers.31.post_attention_layernorm.weight shape: [3072]
+360: model.layers.31.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+361: model.layers.31.self_attn.o_proj.scale shape: [3072]
+362: model.layers.31.self_attn.o_proj.zeroPoint shape: [3072]
+363: model.layers.31.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+364: model.layers.31.self_attn.qkv_proj.scale shape: [9216]
+365: model.layers.31.self_attn.qkv_proj.zeroPoint shape: [9216]
+366: model.layers.4.input_layernorm.weight shape: [3072]
+367: model.layers.4.mlp.down_proj.8bit_weight shape: [3072, 8192]
+368: model.layers.4.mlp.down_proj.scale shape: [3072]
+369: model.layers.4.mlp.down_proj.zeroPoint shape: [3072]
+370: model.layers.4.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+371: model.layers.4.mlp.gate_up_proj.scale shape: [16384]
+372: model.layers.4.mlp.gate_up_proj.zeroPoint shape: [16384]
+373: model.layers.4.post_attention_layernorm.weight shape: [3072]
+374: model.layers.4.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+375: model.layers.4.self_attn.o_proj.scale shape: [3072]
+376: model.layers.4.self_attn.o_proj.zeroPoint shape: [3072]
+377: model.layers.4.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+378: model.layers.4.self_attn.qkv_proj.scale shape: [9216]
+379: model.layers.4.self_attn.qkv_proj.zeroPoint shape: [9216]
+380: model.layers.5.input_layernorm.weight shape: [3072]
+381: model.layers.5.mlp.down_proj.8bit_weight shape: [3072, 8192]
+382: model.layers.5.mlp.down_proj.scale shape: [3072]
+383: model.layers.5.mlp.down_proj.zeroPoint shape: [3072]
+384: model.layers.5.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+385: model.layers.5.mlp.gate_up_proj.scale shape: [16384]
+386: model.layers.5.mlp.gate_up_proj.zeroPoint shape: [16384]
+387: model.layers.5.post_attention_layernorm.weight shape: [3072]
+388: model.layers.5.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+389: model.layers.5.self_attn.o_proj.scale shape: [3072]
+390: model.layers.5.self_attn.o_proj.zeroPoint shape: [3072]
+391: model.layers.5.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+392: model.layers.5.self_attn.qkv_proj.scale shape: [9216]
+393: model.layers.5.self_attn.qkv_proj.zeroPoint shape: [9216]
+394: model.layers.6.input_layernorm.weight shape: [3072]
+395: model.layers.6.mlp.down_proj.8bit_weight shape: [3072, 8192]
+396: model.layers.6.mlp.down_proj.scale shape: [3072]
+397: model.layers.6.mlp.down_proj.zeroPoint shape: [3072]
+398: model.layers.6.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+399: model.layers.6.mlp.gate_up_proj.scale shape: [16384]
+400: model.layers.6.mlp.gate_up_proj.zeroPoint shape: [16384]
+401: model.layers.6.post_attention_layernorm.weight shape: [3072]
+402: model.layers.6.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+403: model.layers.6.self_attn.o_proj.scale shape: [3072]
+404: model.layers.6.self_attn.o_proj.zeroPoint shape: [3072]
+405: model.layers.6.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+406: model.layers.6.self_attn.qkv_proj.scale shape: [9216]
+407: model.layers.6.self_attn.qkv_proj.zeroPoint shape: [9216]
+408: model.layers.7.input_layernorm.weight shape: [3072]
+409: model.layers.7.mlp.down_proj.8bit_weight shape: [3072, 8192]
+410: model.layers.7.mlp.down_proj.scale shape: [3072]
+411: model.layers.7.mlp.down_proj.zeroPoint shape: [3072]
+412: model.layers.7.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+413: model.layers.7.mlp.gate_up_proj.scale shape: [16384]
+414: model.layers.7.mlp.gate_up_proj.zeroPoint shape: [16384]
+415: model.layers.7.post_attention_layernorm.weight shape: [3072]
+416: model.layers.7.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+417: model.layers.7.self_attn.o_proj.scale shape: [3072]
+418: model.layers.7.self_attn.o_proj.zeroPoint shape: [3072]
+419: model.layers.7.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+420: model.layers.7.self_attn.qkv_proj.scale shape: [9216]
+421: model.layers.7.self_attn.qkv_proj.zeroPoint shape: [9216]
+422: model.layers.8.input_layernorm.weight shape: [3072]
+423: model.layers.8.mlp.down_proj.8bit_weight shape: [3072, 8192]
+424: model.layers.8.mlp.down_proj.scale shape: [3072]
+425: model.layers.8.mlp.down_proj.zeroPoint shape: [3072]
+426: model.layers.8.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+427: model.layers.8.mlp.gate_up_proj.scale shape: [16384]
+428: model.layers.8.mlp.gate_up_proj.zeroPoint shape: [16384]
+429: model.layers.8.post_attention_layernorm.weight shape: [3072]
+430: model.layers.8.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+431: model.layers.8.self_attn.o_proj.scale shape: [3072]
+432: model.layers.8.self_attn.o_proj.zeroPoint shape: [3072]
+433: model.layers.8.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+434: model.layers.8.self_attn.qkv_proj.scale shape: [9216]
+435: model.layers.8.self_attn.qkv_proj.zeroPoint shape: [9216]
+436: model.layers.9.input_layernorm.weight shape: [3072]
+437: model.layers.9.mlp.down_proj.8bit_weight shape: [3072, 8192]
+438: model.layers.9.mlp.down_proj.scale shape: [3072]
+439: model.layers.9.mlp.down_proj.zeroPoint shape: [3072]
+440: model.layers.9.mlp.gate_up_proj.8bit_weight shape: [16384, 3072]
+441: model.layers.9.mlp.gate_up_proj.scale shape: [16384]
+442: model.layers.9.mlp.gate_up_proj.zeroPoint shape: [16384]
+443: model.layers.9.post_attention_layernorm.weight shape: [3072]
+444: model.layers.9.self_attn.o_proj.8bit_weight shape: [3072, 3072]
+445: model.layers.9.self_attn.o_proj.scale shape: [3072]
+446: model.layers.9.self_attn.o_proj.zeroPoint shape: [3072]
+447: model.layers.9.self_attn.qkv_proj.8bit_weight shape: [9216, 3072]
+448: model.layers.9.self_attn.qkv_proj.scale shape: [9216]
+449: model.layers.9.self_attn.qkv_proj.zeroPoint shape: [9216]
+450: model.norm.weight shape: [3072]
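
Note on the int8 listing above: quantization keeps the layernorm weights in floating point and replaces each Linear projection's single weight tensor with three tensors, an 8bit_weight of shape [out_features, in_features] plus per-output-channel scale and zeroPoint vectors of length out_features (hence [3072] for o_proj and down_proj, [9216] for qkv_proj, [16384] for gate_up_proj). Below is a minimal C# sketch of the per-channel affine dequantization this layout implies; the helper is illustrative and assumes w ≈ (q - zeroPoint) * scale, it is not GenAI.Core's actual implementation:

    // Dequantize a per-output-channel int8 linear weight (sketch, not library code).
    // q: the 8bit_weight tensor [outFeatures, inFeatures];
    // scale, zeroPoint: per-output-channel vectors [outFeatures].
    static float[,] DequantizeInt8(byte[,] q, float[] scale, float[] zeroPoint)
    {
        int outFeatures = q.GetLength(0), inFeatures = q.GetLength(1);
        var w = new float[outFeatures, inFeatures];
        for (int i = 0; i < outFeatures; i++)
            for (int j = 0; j < inFeatures; j++)
                w[i, j] = (q[i, j] - zeroPoint[i]) * scale[i];
        return w;
    }
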
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KShapeTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KShapeTest.approved.txt
new file mode 100644
index 0000000000..2278f3b67d
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.Phi3Mini4KShapeTest.approved.txt
@@ -0,0 +1,195 @@
+0: lm_head.weight shape: [32064, 3072]
+1: model.embed_tokens.weight shape: [32064, 3072]
+2: model.layers.0.input_layernorm.weight shape: [3072]
+3: model.layers.0.mlp.down_proj.weight shape: [3072, 8192]
+4: model.layers.0.mlp.gate_up_proj.weight shape: [16384, 3072]
+5: model.layers.0.post_attention_layernorm.weight shape: [3072]
+6: model.layers.0.self_attn.o_proj.weight shape: [3072, 3072]
+7: model.layers.0.self_attn.qkv_proj.weight shape: [9216, 3072]
+8: model.layers.1.input_layernorm.weight shape: [3072]
+9: model.layers.1.mlp.down_proj.weight shape: [3072, 8192]
+10: model.layers.1.mlp.gate_up_proj.weight shape: [16384, 3072]
+11: model.layers.1.post_attention_layernorm.weight shape: [3072]
+12: model.layers.1.self_attn.o_proj.weight shape: [3072, 3072]
+13: model.layers.1.self_attn.qkv_proj.weight shape: [9216, 3072]
+14: model.layers.10.input_layernorm.weight shape: [3072]
+15: model.layers.10.mlp.down_proj.weight shape: [3072, 8192]
+16: model.layers.10.mlp.gate_up_proj.weight shape: [16384, 3072]
+17: model.layers.10.post_attention_layernorm.weight shape: [3072]
+18: model.layers.10.self_attn.o_proj.weight shape: [3072, 3072]
+19: model.layers.10.self_attn.qkv_proj.weight shape: [9216, 3072]
+20: model.layers.11.input_layernorm.weight shape: [3072]
+21: model.layers.11.mlp.down_proj.weight shape: [3072, 8192]
+22: model.layers.11.mlp.gate_up_proj.weight shape: [16384, 3072]
+23: model.layers.11.post_attention_layernorm.weight shape: [3072]
+24: model.layers.11.self_attn.o_proj.weight shape: [3072, 3072]
+25: model.layers.11.self_attn.qkv_proj.weight shape: [9216, 3072]
+26: model.layers.12.input_layernorm.weight shape: [3072]
+27: model.layers.12.mlp.down_proj.weight shape: [3072, 8192]
+28: model.layers.12.mlp.gate_up_proj.weight shape: [16384, 3072]
+29: model.layers.12.post_attention_layernorm.weight shape: [3072]
+30: model.layers.12.self_attn.o_proj.weight shape: [3072, 3072]
+31: model.layers.12.self_attn.qkv_proj.weight shape: [9216, 3072]
+32: model.layers.13.input_layernorm.weight shape: [3072]
+33: model.layers.13.mlp.down_proj.weight shape: [3072, 8192]
+34: model.layers.13.mlp.gate_up_proj.weight shape: [16384, 3072]
+35: model.layers.13.post_attention_layernorm.weight shape: [3072]
+36: model.layers.13.self_attn.o_proj.weight shape: [3072, 3072]
+37: model.layers.13.self_attn.qkv_proj.weight shape: [9216, 3072]
+38: model.layers.14.input_layernorm.weight shape: [3072]
+39: model.layers.14.mlp.down_proj.weight shape: [3072, 8192]
+40: model.layers.14.mlp.gate_up_proj.weight shape: [16384, 3072]
+41: model.layers.14.post_attention_layernorm.weight shape: [3072]
+42: model.layers.14.self_attn.o_proj.weight shape: [3072, 3072]
+43: model.layers.14.self_attn.qkv_proj.weight shape: [9216, 3072]
+44: model.layers.15.input_layernorm.weight shape: [3072]
+45: model.layers.15.mlp.down_proj.weight shape: [3072, 8192]
+46: model.layers.15.mlp.gate_up_proj.weight shape: [16384, 3072]
+47: model.layers.15.post_attention_layernorm.weight shape: [3072]
+48: model.layers.15.self_attn.o_proj.weight shape: [3072, 3072]
+49: model.layers.15.self_attn.qkv_proj.weight shape: [9216, 3072]
+50: model.layers.16.input_layernorm.weight shape: [3072]
+51: model.layers.16.mlp.down_proj.weight shape: [3072, 8192]
+52: model.layers.16.mlp.gate_up_proj.weight shape: [16384, 3072]
+53: model.layers.16.post_attention_layernorm.weight shape: [3072]
+54: model.layers.16.self_attn.o_proj.weight shape: [3072, 3072]
+55: model.layers.16.self_attn.qkv_proj.weight shape: [9216, 3072]
+56: model.layers.17.input_layernorm.weight shape: [3072]
+57: model.layers.17.mlp.down_proj.weight shape: [3072, 8192]
+58: model.layers.17.mlp.gate_up_proj.weight shape: [16384, 3072]
+59: model.layers.17.post_attention_layernorm.weight shape: [3072]
+60: model.layers.17.self_attn.o_proj.weight shape: [3072, 3072]
+61: model.layers.17.self_attn.qkv_proj.weight shape: [9216, 3072]
+62: model.layers.18.input_layernorm.weight shape: [3072]
+63: model.layers.18.mlp.down_proj.weight shape: [3072, 8192]
+64: model.layers.18.mlp.gate_up_proj.weight shape: [16384, 3072]
+65: model.layers.18.post_attention_layernorm.weight shape: [3072]
+66: model.layers.18.self_attn.o_proj.weight shape: [3072, 3072]
+67: model.layers.18.self_attn.qkv_proj.weight shape: [9216, 3072]
+68: model.layers.19.input_layernorm.weight shape: [3072]
+69: model.layers.19.mlp.down_proj.weight shape: [3072, 8192]
+70: model.layers.19.mlp.gate_up_proj.weight shape: [16384, 3072]
+71: model.layers.19.post_attention_layernorm.weight shape: [3072]
+72: model.layers.19.self_attn.o_proj.weight shape: [3072, 3072]
+73: model.layers.19.self_attn.qkv_proj.weight shape: [9216, 3072]
+74: model.layers.2.input_layernorm.weight shape: [3072]
+75: model.layers.2.mlp.down_proj.weight shape: [3072, 8192]
+76: model.layers.2.mlp.gate_up_proj.weight shape: [16384, 3072]
+77: model.layers.2.post_attention_layernorm.weight shape: [3072]
+78: model.layers.2.self_attn.o_proj.weight shape: [3072, 3072]
+79: model.layers.2.self_attn.qkv_proj.weight shape: [9216, 3072]
+80: model.layers.20.input_layernorm.weight shape: [3072]
+81: model.layers.20.mlp.down_proj.weight shape: [3072, 8192]
+82: model.layers.20.mlp.gate_up_proj.weight shape: [16384, 3072]
+83: model.layers.20.post_attention_layernorm.weight shape: [3072]
+84: model.layers.20.self_attn.o_proj.weight shape: [3072, 3072]
+85: model.layers.20.self_attn.qkv_proj.weight shape: [9216, 3072]
+86: model.layers.21.input_layernorm.weight shape: [3072]
+87: model.layers.21.mlp.down_proj.weight shape: [3072, 8192]
+88: model.layers.21.mlp.gate_up_proj.weight shape: [16384, 3072]
+89: model.layers.21.post_attention_layernorm.weight shape: [3072]
+90: model.layers.21.self_attn.o_proj.weight shape: [3072, 3072]
+91: model.layers.21.self_attn.qkv_proj.weight shape: [9216, 3072]
+92: model.layers.22.input_layernorm.weight shape: [3072]
+93: model.layers.22.mlp.down_proj.weight shape: [3072, 8192]
+94: model.layers.22.mlp.gate_up_proj.weight shape: [16384, 3072]
+95: model.layers.22.post_attention_layernorm.weight shape: [3072]
+96: model.layers.22.self_attn.o_proj.weight shape: [3072, 3072]
+97: model.layers.22.self_attn.qkv_proj.weight shape: [9216, 3072]
+98: model.layers.23.input_layernorm.weight shape: [3072]
+99: model.layers.23.mlp.down_proj.weight shape: [3072, 8192]
+100: model.layers.23.mlp.gate_up_proj.weight shape: [16384, 3072]
+101: model.layers.23.post_attention_layernorm.weight shape: [3072]
+102: model.layers.23.self_attn.o_proj.weight shape: [3072, 3072]
+103: model.layers.23.self_attn.qkv_proj.weight shape: [9216, 3072]
+104: model.layers.24.input_layernorm.weight shape: [3072]
+105: model.layers.24.mlp.down_proj.weight shape: [3072, 8192]
+106: model.layers.24.mlp.gate_up_proj.weight shape: [16384, 3072]
+107: model.layers.24.post_attention_layernorm.weight shape: [3072]
+108: model.layers.24.self_attn.o_proj.weight shape: [3072, 3072]
+109: model.layers.24.self_attn.qkv_proj.weight shape: [9216, 3072]
+110: model.layers.25.input_layernorm.weight shape: [3072]
+111: model.layers.25.mlp.down_proj.weight shape: [3072, 8192]
+112: model.layers.25.mlp.gate_up_proj.weight shape: [16384, 3072]
+113: model.layers.25.post_attention_layernorm.weight shape: [3072]
+114: model.layers.25.self_attn.o_proj.weight shape: [3072, 3072]
+115: model.layers.25.self_attn.qkv_proj.weight shape: [9216, 3072]
+116: model.layers.26.input_layernorm.weight shape: [3072]
+117: model.layers.26.mlp.down_proj.weight shape: [3072, 8192]
+118: model.layers.26.mlp.gate_up_proj.weight shape: [16384, 3072]
+119: model.layers.26.post_attention_layernorm.weight shape: [3072]
+120: model.layers.26.self_attn.o_proj.weight shape: [3072, 3072]
+121: model.layers.26.self_attn.qkv_proj.weight shape: [9216, 3072]
+122: model.layers.27.input_layernorm.weight shape: [3072]
+123: model.layers.27.mlp.down_proj.weight shape: [3072, 8192]
+124: model.layers.27.mlp.gate_up_proj.weight shape: [16384, 3072]
+125: model.layers.27.post_attention_layernorm.weight shape: [3072]
+126: model.layers.27.self_attn.o_proj.weight shape: [3072, 3072]
+127: model.layers.27.self_attn.qkv_proj.weight shape: [9216, 3072]
+128: model.layers.28.input_layernorm.weight shape: [3072]
+129: model.layers.28.mlp.down_proj.weight shape: [3072, 8192]
+130: model.layers.28.mlp.gate_up_proj.weight shape: [16384, 3072]
+131: model.layers.28.post_attention_layernorm.weight shape: [3072]
+132: model.layers.28.self_attn.o_proj.weight shape: [3072, 3072]
+133: model.layers.28.self_attn.qkv_proj.weight shape: [9216, 3072]
+134: model.layers.29.input_layernorm.weight shape: [3072]
+135: model.layers.29.mlp.down_proj.weight shape: [3072, 8192]
+136: model.layers.29.mlp.gate_up_proj.weight shape: [16384, 3072]
+137: model.layers.29.post_attention_layernorm.weight shape: [3072]
+138: model.layers.29.self_attn.o_proj.weight shape: [3072, 3072]
+139: model.layers.29.self_attn.qkv_proj.weight shape: [9216, 3072]
+140: model.layers.3.input_layernorm.weight shape: [3072]
+141: model.layers.3.mlp.down_proj.weight shape: [3072, 8192]
+142: model.layers.3.mlp.gate_up_proj.weight shape: [16384, 3072]
+143: model.layers.3.post_attention_layernorm.weight shape: [3072]
+144: model.layers.3.self_attn.o_proj.weight shape: [3072, 3072]
+145: model.layers.3.self_attn.qkv_proj.weight shape: [9216, 3072]
+146: model.layers.30.input_layernorm.weight shape: [3072]
+147: model.layers.30.mlp.down_proj.weight shape: [3072, 8192]
+148: model.layers.30.mlp.gate_up_proj.weight shape: [16384, 3072]
+149: model.layers.30.post_attention_layernorm.weight shape: [3072]
+150: model.layers.30.self_attn.o_proj.weight shape: [3072, 3072]
+151: model.layers.30.self_attn.qkv_proj.weight shape: [9216, 3072]
+152: model.layers.31.input_layernorm.weight shape: [3072]
+153: model.layers.31.mlp.down_proj.weight shape: [3072, 8192]
+154: model.layers.31.mlp.gate_up_proj.weight shape: [16384, 3072]
+155: model.layers.31.post_attention_layernorm.weight shape: [3072]
+156: model.layers.31.self_attn.o_proj.weight shape: [3072, 3072]
+157: model.layers.31.self_attn.qkv_proj.weight shape: [9216, 3072]
+158: model.layers.4.input_layernorm.weight shape: [3072]
+159: model.layers.4.mlp.down_proj.weight shape: [3072, 8192]
+160: model.layers.4.mlp.gate_up_proj.weight shape: [16384, 3072]
+161: model.layers.4.post_attention_layernorm.weight shape: [3072]
+162: model.layers.4.self_attn.o_proj.weight shape: [3072, 3072]
+163: model.layers.4.self_attn.qkv_proj.weight shape: [9216, 3072]
+164: model.layers.5.input_layernorm.weight shape: [3072]
+165: model.layers.5.mlp.down_proj.weight shape: [3072, 8192]
+166: model.layers.5.mlp.gate_up_proj.weight shape: [16384, 3072]
+167: model.layers.5.post_attention_layernorm.weight shape: [3072]
+168: model.layers.5.self_attn.o_proj.weight shape: [3072, 3072]
+169: model.layers.5.self_attn.qkv_proj.weight shape: [9216, 3072]
+170: model.layers.6.input_layernorm.weight shape: [3072]
+171: model.layers.6.mlp.down_proj.weight shape: [3072, 8192]
+172: model.layers.6.mlp.gate_up_proj.weight shape: [16384, 3072]
+173: model.layers.6.post_attention_layernorm.weight shape: [3072]
+174: model.layers.6.self_attn.o_proj.weight shape: [3072, 3072]
+175: model.layers.6.self_attn.qkv_proj.weight shape: [9216, 3072]
+176: model.layers.7.input_layernorm.weight shape: [3072]
+177: model.layers.7.mlp.down_proj.weight shape: [3072, 8192]
+178: model.layers.7.mlp.gate_up_proj.weight shape: [16384, 3072]
+179: model.layers.7.post_attention_layernorm.weight shape: [3072]
+180: model.layers.7.self_attn.o_proj.weight shape: [3072, 3072]
+181: model.layers.7.self_attn.qkv_proj.weight shape: [9216, 3072]
+182: model.layers.8.input_layernorm.weight shape: [3072]
+183: model.layers.8.mlp.down_proj.weight shape: [3072, 8192]
+184: model.layers.8.mlp.gate_up_proj.weight shape: [16384, 3072]
+185: model.layers.8.post_attention_layernorm.weight shape: [3072]
+186: model.layers.8.self_attn.o_proj.weight shape: [3072, 3072]
+187: model.layers.8.self_attn.qkv_proj.weight shape: [9216, 3072]
+188: model.layers.9.input_layernorm.weight shape: [3072]
+189: model.layers.9.mlp.down_proj.weight shape: [3072, 8192]
+190: model.layers.9.mlp.gate_up_proj.weight shape: [16384, 3072]
+191: model.layers.9.post_attention_layernorm.weight shape: [3072]
+192: model.layers.9.self_attn.o_proj.weight shape: [3072, 3072]
+193: model.layers.9.self_attn.qkv_proj.weight shape: [9216, 3072]
+194: model.norm.weight shape: [3072]
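
Cross-checking the two shape dumps: the unquantized Phi-3-mini listing above has 195 tensors, i.e. 32 decoder layers × 6 tensors each (two layernorm weights plus the qkv_proj, o_proj, gate_up_proj and down_proj weights), plus lm_head.weight, model.embed_tokens.weight and model.norm.weight: 32 × 6 + 3 = 195. The int8 listing ends at index 450, i.e. 451 tensors, consistent with each of the four projections per layer splitting into three tensors while everything else stays a single tensor: 32 × (2 + 4 × 3) + 3 = 32 × 14 + 3 = 451.
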
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.TokenizerTest.approved.txt b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.TokenizerTest.approved.txt
new file mode 100644
index 0000000000..70624d24df
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Approvals/Phi3Tests.TokenizerTest.approved.txt
@@ -0,0 +1,20 @@
+Can you provide ways to eat combinations of bananas and dragonfruits?
+1, 1815, 366, 3867, 5837, 304, 17545, 18240, 310, 9892, 16397, 322, 8338, 265, 29888, 21211, 29973
+Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.
+1, 18585, 29991, 2266, 526, 777, 5837, 304, 17545, 9892, 16397, 322, 8338, 265, 29888, 21211, 4208, 29901, 29871, 29896, 29889, 10765, 1648, 322, 8338, 265, 29888, 9216, 10597, 347, 29901, 3164, 355, 9892, 16397, 322, 8338, 265, 29888, 21211, 4208, 411, 777, 27274, 322, 298, 4992, 29889, 29871, 29906, 29889, 10765, 1648, 322, 8338, 265, 29888, 9216, 4497, 328, 29901, 23478, 269, 506, 287, 9892, 16397, 322, 8338, 265, 29888, 21211, 4208, 411, 777, 454, 3712, 3623, 625, 322, 298, 4992, 29889
+What about solving an 2x + 3 = 7 equation?
+1, 1724, 1048, 17069, 385, 29871, 29906, 29916, 718, 29871, 29941, 353, 29871, 29955, 6306, 29973
+
+Count to 3
+
+1, 29871, 13, 3981, 304, 29871, 29941, 13
+<|user|>
+1, 32010
+<|end|>
+1, 32007
+<|assistant|>
+1, 32001
+<|user|>
+Count to 3<|end|>
+<|assistant|>
+1, 32010, 29871, 13, 3981, 304, 29871, 29941, 32007, 13, 32001
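
The tokenizer approval file above alternates decoded text with the token ids produced for it. The leading 1 on every id line is the beginning-of-sentence token the tokenizer prepends (Phi3Tests below asserts BeginningOfSentenceId == 1), and the ids 32010, 32007 and 32001 are the <|user|>, <|end|> and <|assistant|> special tokens, matching the EncodeToIds("<|end|>") assertion in Phi3Tests.TokenizerTest.
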
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/AutoGenTests.cs b/test/Microsoft.ML.GenAI.Phi.Tests/AutoGenTests.cs
new file mode 100644
index 0000000000..33ab565fe7
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/AutoGenTests.cs
@@ -0,0 +1,40 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using AutoGen.Core;
+using FluentAssertions;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.Tokenizers;
+using Moq;
+using Xunit;
+
+namespace Microsoft.ML.GenAI.Phi.Tests;
+
+public class AutoGenTests
+{
+    [Fact]
+    public async Task ItGenerateTextReply()
+    {
+        var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
+        // mock generate api
+        Mock.Get(pipeline).Setup(p => p.Generate(
+            It.IsAny<string>(), // prompt
+            It.IsAny<int>(), // max length
+            It.IsAny<float>(), // temperature
+            It.IsAny<float>(), // top_p
+            It.IsAny<string[]>())) // stop sequence
+            .Callback((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) =>
+            {
+                // check prompt
+                prompt.Should().Be("<|system|>\nyou are a helpful assistant<|end|>\n<|user|>\nhey<|end|>\n<|assistant|>");
+            })
+            .Returns((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) => "hello");
+
+        var agent = new Phi3Agent(pipeline, "assistant");
+        var reply = await agent.SendAsync("hey");
+
+        reply.GetContent().Should().Be("hello");
+        reply.From.Should().Be(agent.Name);
+    }
+}
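
For orientation, the Moq pipeline in the test above stands in for a real generation pipeline. Against actual weights the wiring would look roughly like the sketch below; only Phi3Agent, SendAsync, Phi3ForCasualLM, Phi3Config and Phi3TokenizerHelper.FromPretrained appear elsewhere in this diff, while the CausalLMPipeline constructor shape and the weight-loading step are assumptions:

    // Sketch only: replaces the mock with a (hypothetical) real pipeline.
    var tokenizer = Phi3TokenizerHelper.FromPretrained("Phi-3-mini-4k-instruct/tokenizer.model");
    var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini4kInstruct); // weight loading omitted
    var pipeline = new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, model); // assumed ctor
    var agent = new Phi3Agent(pipeline, "assistant");
    var reply = await agent.SendAsync("hey");
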
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Microsoft.ML.GenAI.Phi.Tests.csproj b/test/Microsoft.ML.GenAI.Phi.Tests/Microsoft.ML.GenAI.Phi.Tests.csproj
new file mode 100644
index 0000000000..dbe744ab12
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Microsoft.ML.GenAI.Phi.Tests.csproj
@@ -0,0 +1,43 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
+    <Nullable>enable</Nullable>
+    <IsTestProject>true</IsTestProject>
+  </PropertyGroup>
+
+  <!-- NOTE: the XML element tags of this file were stripped in extraction, leaving only
+       inner text (net6.0, enable, $(NoWarn);MSML_ExtendBaseTestClass, enable, true,
+       PreserveNewest). The element names above are reconstructed from their values and
+       may differ from the source. -->
+  <ItemGroup>
+    <!-- PackageReference/ProjectReference entries carried no inner text and are not
+         recoverable from this copy. -->
+  </ItemGroup>
+
+  <ItemGroup>
+    <!-- Content item whose include path was lost in extraction; it carried:
+         <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> -->
+  </ItemGroup>
+
+</Project>
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Phi2Tests.cs b/test/Microsoft.ML.GenAI.Phi.Tests/Phi2Tests.cs
new file mode 100644
index 0000000000..33402e73a0
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Phi2Tests.cs
@@ -0,0 +1,63 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Text;
+using ApprovalTests;
+using ApprovalTests.Namers;
+using ApprovalTests.Reporters;
+using FluentAssertions;
+using Microsoft.ML.GenAI.Core.Extension;
+using Microsoft.ML.Tokenizers;
+using TorchSharp;
+using Xunit;
+namespace Microsoft.ML.GenAI.Phi.Tests;
+
+[Collection("NoParallelization")]
+public class Phi2Tests
+{
+    public Phi2Tests()
+    {
+        if (Environment.GetEnvironmentVariable("HELIX_CORRELATION_ID") != null)
+        {
+            Approvals.UseAssemblyLocationForApprovedFiles();
+        }
+        torch.set_default_device("meta");
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void LoadSafeTensorShapeTest()
+    {
+        var model = new Phi2ForCasualLM(Phi2Config.Phi2);
+        var stateDictStr = model.PeekShape();
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void TokenizerTest()
+    {
+        var modelWeightFolder = Path.Join("Phi-2");
+        var tokenizer = Phi2TokenizerHelper.Create(modelWeightFolder, addBeginOfSentence: true);
+        tokenizer.EndOfSentenceId.Should().Be(50256);
+        tokenizer.BeginningOfSentenceId.Should().Be(50256);
+        var messages = new string[]
+        {
+            "Can you provide ways to eat combinations of bananas and dragonfruits?",
+            "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.",
+            "What about solving an 2x + 3 = 7 equation?"
+        };
+        var sb = new StringBuilder();
+        foreach (var message in messages)
+        {
+            var tokenized = tokenizer.EncodeToIds(message, true, false);
+            var tokenizedStr = string.Join(", ", tokenized.Select(x => x.ToString()));
+
+            sb.AppendLine(tokenizedStr);
+        }
+        Approvals.Verify(sb.ToString());
+    }
+}
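
Both fixtures call torch.set_default_device("meta") in their constructors. On the meta device TorchSharp tensors carry shape and dtype but allocate no storage, which is what lets these approval tests construct multi-billion-parameter Phi configurations and peek at their shapes without downloading or materializing any weights. A tiny illustration (the sizes are taken from the shape dumps above; the snippet itself is not part of the tests):

    torch.set_default_device("meta");
    var t = torch.zeros(32064, 3072);              // no real allocation happens here
    Console.WriteLine(string.Join(", ", t.shape)); // 32064, 3072
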
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/Phi3Tests.cs b/test/Microsoft.ML.GenAI.Phi.Tests/Phi3Tests.cs
new file mode 100644
index 0000000000..1200d79f9d
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/Phi3Tests.cs
@@ -0,0 +1,151 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Text;
+using System.Text.Json;
+using ApprovalTests;
+using ApprovalTests.Namers;
+using ApprovalTests.Reporters;
+using FluentAssertions;
+using Microsoft.ML.GenAI.Core.Extension;
+using TorchSharp;
+using Xunit;
+
+namespace Microsoft.ML.GenAI.Phi.Tests;
+
+[Collection("NoParallelization")]
+public class Phi3Tests
+{
+    public Phi3Tests()
+    {
+        if (Environment.GetEnvironmentVariable("HELIX_CORRELATION_ID") != null)
+        {
+            Approvals.UseAssemblyLocationForApprovedFiles();
+        }
+
+        torch.set_default_device("meta");
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Mini4KShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini4kInstruct);
+        var stateDictStr = model.PeekShape();
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Mini4KInt8QuantizeShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini4kInstruct);
+        model.ToInt8QuantizeModule();
+        var size = model.GetSizeInBytes();
+        var stateDictStr = model.PeekShape();
+        var sizeInGB = size / 1024 / 1024 / 1024;
+        sizeInGB.Should().Be(3);
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Mini4KInt4QuantizeShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini4kInstruct);
+        model.ToInt4QuantizeModule();
+        var size = model.GetSizeInBytes();
+        var stateDictStr = model.PeekShape();
+        var sizeInGB = size / 1024 / 1024 / 1024;
+        sizeInGB.Should().Be(2);
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Medium4KShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Medium4kInstruct);
+        var stateDictStr = model.PeekShape();
+        Approvals.Verify(stateDictStr);
+    }
+
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Medium128KShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Medium128kInstruct);
+        var stateDictStr = model.PeekShape();
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Mini128KShapeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini128kInstruct);
+        var stateDictStr = model.PeekShape();
+        Approvals.Verify(stateDictStr);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void Phi3Mini128KLayerSizeTest()
+    {
+        var model = new Phi3ForCasualLM(Phi3Config.Phi3Mini128kInstruct);
+        var size = model.GetSizeForEachDynamicLayerInBytes();
+        // convert size to MB
+        var sizeInMB = size.ToDictionary(x => x.Key, x => x.Value / 1024 / 1024);
+
+        var json = JsonSerializer.Serialize(sizeInMB, new JsonSerializerOptions { WriteIndented = true });
+        Approvals.Verify(json);
+    }
+
+    [Fact]
+    [UseReporter(typeof(DiffReporter))]
+    [UseApprovalSubdirectory("Approvals")]
+    public void TokenizerTest()
+    {
+        var modelWeightFolder = Path.Join("Llama");
+        var modelPath = Path.Join(modelWeightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(modelPath);
+        tokenizer.BeginningOfSentenceId.Should().Be(1);
+        tokenizer.EndOfSentenceId.Should().Be(2);
+
+        // test <|end|>
+        var endIds = tokenizer.EncodeToIds("<|end|>", addBeginningOfSentence: false, addEndOfSentence: false, considerPreTokenization: false, considerNormalization: false);
+        endIds.Should().BeEquivalentTo(new int[] { 32007 });
+
+        var messages = new string[]
+        {
+            "Can you provide ways to eat combinations of bananas and dragonfruits?",
+            "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey.",
+            "What about solving an 2x + 3 = 7 equation?",
+            "\nCount to 3\n",
+            "<|user|>",
+            "<|end|>",
+            "<|assistant|>",
+            "<|user|>\nCount to 3<|end|>\n<|assistant|>",
+        };
+        var sb = new StringBuilder();
+        foreach (var message in messages)
+        {
+            var tokenizeIds = tokenizer.EncodeToIds(message, true, false, considerPreTokenization: true);
+            var decodeToString = tokenizer.Decode(tokenizeIds, considerSpecialTokens: true);
+            sb.AppendLine(decodeToString);
+            var tokenizedStr = string.Join(", ", tokenizeIds.Select(x => x.ToString()));
+
+            sb.AppendLine(tokenizedStr);
+        }
+        Approvals.Verify(sb.ToString());
+    }
+}
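
The last few messages in TokenizerTest above are already written in the Phi-3 chat template. For reference, a minimal sketch that reproduces the exact prompt string asserted in AutoGenTests and SemanticKernelTests; the helper is illustrative, the PR's agents format this prompt internally:

    // Builds "<|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>" (sketch).
    static string BuildPhi3Prompt(string? system, string user)
    {
        var sb = new System.Text.StringBuilder();
        if (system is not null)
        {
            sb.Append("<|system|>\n").Append(system).Append("<|end|>\n");
        }
        sb.Append("<|user|>\n").Append(user).Append("<|end|>\n");
        sb.Append("<|assistant|>");
        return sb.ToString();
    }
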
diff --git a/test/Microsoft.ML.GenAI.Phi.Tests/SemanticKernelTests.cs b/test/Microsoft.ML.GenAI.Phi.Tests/SemanticKernelTests.cs
new file mode 100644
index 0000000000..98359a8722
--- /dev/null
+++ b/test/Microsoft.ML.GenAI.Phi.Tests/SemanticKernelTests.cs
@@ -0,0 +1,79 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using FluentAssertions;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.ML.Tokenizers;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Moq;
+using Xunit;
+
+namespace Microsoft.ML.GenAI.Phi.Tests;
+
+public class SemanticKernelTests
+{
+    [Fact]
+    public async Task ItAddPhi3CausalLMChatCompletionServiceTestAsync()
+    {
+        var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
+        // mock generate api
+        Mock.Get(pipeline).Setup(p => p.Generate(
+            It.IsAny<string>(), // prompt
+            It.IsAny<int>(), // max length
+            It.IsAny<float>(), // temperature
+            It.IsAny<float>(), // top_p
+            It.IsAny<string[]>())) // stop sequence
+            .Callback((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) =>
+            {
+                // check prompt
+                prompt.Should().Be("<|system|>\nyou are a helpful assistant<|end|>\n<|user|>\nhey<|end|>\n<|assistant|>");
+            })
+            .Returns((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) => "hello");
+
+        var kernel = Kernel.CreateBuilder()
+            .AddGenAIChatCompletion(pipeline)
+            .Build();
+
+        var chatService = kernel.Services.GetRequiredService<IChatCompletionService>();
+
+        var chatHistory = new ChatHistory();
+        chatHistory.AddSystemMessage("you are a helpful assistant");
+        chatHistory.AddUserMessage("hey");
+        var responses = await chatService.GetChatMessageContentsAsync(chatHistory);
+        responses.Count().Should().Be(1);
+        var response = responses.First();
+        response.Role.Should().Be(AuthorRole.Assistant);
+        response.Items.Count().Should().Be(1);
+        var textContent = response.Items.First() as TextContent;
+        textContent!.Text.Should().Be("hello");
+    }
+
+    [Fact]
+    public async Task ItAddPhi3CausalLMTextGenerationServiceTestAsync()
+    {
+        var pipeline = Mock.Of<ICausalLMPipeline<Tokenizer, Phi3ForCasualLM>>();
+        // mock generate api
+        Mock.Get(pipeline).Setup(p => p.Generate(
+            It.IsAny<string>(), // prompt
+            It.IsAny<int>(), // max length
+            It.IsAny<float>(), // temperature
+            It.IsAny<float>(), // top_p
+            It.IsAny<string[]>())) // stop sequence
+            .Callback((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) =>
+            {
+                // check prompt
+                prompt.Should().Be("test");
+            })
+            .Returns((string prompt, int maxLen, float temperature, float topP, string[] stopSequences) => "hello");
+
+        var kernel = Kernel.CreateBuilder()
+            .AddGenAITextGeneration(pipeline)
+            .Build();
+
+        var response = await kernel.InvokePromptAsync("test");
+    }
+}