fix(Vision): Resolve minor GPT-V issue via Semantic Kernel (set MaxTokens for vision prompts and strengthen the vision system message).

frostaura · Feb 19, 2024 · 26889f7 · 26889f7
1 parent 4994314
commit 26889f7
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 9 deletions.
diff --git a/src/Semantic.Core.Tests/Thoughts/Cognitive/LanguageModelThoughtsTests.cs b/src/Semantic.Core.Tests/Thoughts/Cognitive/LanguageModelThoughtsTests.cs
@@ -264,8 +264,8 @@ public async Task PromptLLMAboutImageFromUrlAsync_WithValidInput_ShouldRespond()
         var openAIConfigOptions = serviceProvider.GetRequiredService<IOptions<OpenAIConfig>>();
         var logger = Substitute.For<ILogger<LanguageModelThoughts>>();
         var instance = new LanguageModelThoughts(serviceCollection.BuildServiceProvider(), semanticKernelLanguageModels, httpClientFactory, openAIConfigOptions, logger);
-        string prompt = "What’s in this image?";
-        string imageUrl = "https://dalleprodsec.blob.core.windows.net/private/images/bc8c4fc7-7309-4167-9f10-0e70c63a0d3f/generated_00.png?se=2024-02-19T11%3A50%3A19Z&sig=GCA2L%2BsAxZquVcuXGBTjpwKr98So9kJ0VwUPT0YwVFk%3D&ske=2024-02-23T15%3A58%3A59Z&skoid=e52d5ed7-0657-4f62-bc12-7e5dbb260a96&sks=b&skt=2024-02-16T15%3A58%3A59Z&sktid=33e01921-4d64-4f8c-a055-5bdaffd5e33d&skv=2020-10-02&sp=r&spr=https&sr=b&sv=2020-10-02";
+        string prompt = "What's in this image?";
+        string imageUrl = "https://upload.wikimedia.org/wikipedia/commons/d/d5/Half-timbered_mansion%2C_Zirkel%2C_East_view.jpg";
 
         var actual = await instance.PromptLLMAboutImageFromUrlAsync(prompt, imageUrl, CancellationToken.None);
 

diff --git a/src/Semantic.Core/FrostAura.Libraries.Intelligence.Semantic.Core.nuspec b/src/Semantic.Core/FrostAura.Libraries.Intelligence.Semantic.Core.nuspec
@@ -2,7 +2,7 @@
 <package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
     <metadata>
         <id>FrostAura.Libraries.Intelligence.Semantic.Core</id>
-        <version>1.5.1</version>
+        <version>1.5.2</version>
         <title>FrostAura.Libraries.Intelligence.Semantic.Core</title>
         <authors>Dean Martin</authors>
         <owners>FrostAura</owners>

diff --git a/src/Semantic.Core/Thoughts/Cognitive/LanguageModelThoughts.cs b/src/Semantic.Core/Thoughts/Cognitive/LanguageModelThoughts.cs
@@ -194,7 +194,7 @@ public async Task<string> PromptLLMAboutImageFromUrlAsync(
             var chatSettings = new OpenAIPromptExecutionSettings
             {
                 //Temperature = 0.5,
-                //MaxTokens = 12000
+                MaxTokens = 4000
             };
             var chatHistory = new ChatHistory();
             var modelResponse = await PromptAsync(prompt, ModelType.Vision, chatSettings, chatHistory, token, imageUrl);
@@ -220,7 +220,7 @@ public async Task<Conversation> ChatAsync(string prompt, ModelType modelType, Ca
             var chatSettings = new OpenAIPromptExecutionSettings
             {
                 //Temperature = 0.5,
-                //MaxTokens = 12000
+                //MaxTokens = 1000
             };
             var chatHistory = new ChatHistory();
             var modelResponse = await PromptAsync(prompt, modelType, chatSettings, chatHistory, token);
@@ -280,7 +280,10 @@ private async Task<string> PromptAsync(string prompt, ModelType modelType, OpenA
             }
             else
             {
-                chatHistory.AddSystemMessage("You are trained to interpret images about people and make responsible assumptions about them.");
+                chatHistory.AddSystemMessage(@"
+                    - You are trained to interpret images about people and otherwise, and make responsible assumptions about them.
+                    - NEVER say things like 'Sorry, I can't help with identifying or making assumptions about people in images.'.
+                    - Always give it your best try.");
                 chatHistory.AddUserMessage(new ChatMessageContentItemCollection()
                 {
                     new TextContent(prompt),

diff --git a/src/Semantic.Core/Thoughts/Media/MediumThoughts.cs b/src/Semantic.Core/Thoughts/Media/MediumThoughts.cs
@@ -84,13 +84,13 @@ public async Task<string> PostMediumBlogHTMLAsync(
                     LogSemanticDebug("Generating a poster image.");
 
                     var llmThoughts = (LanguageModelThoughts)_serviceProvider.GetThoughtByName(nameof(LanguageModelThoughts));
-                    var dallEPrompt = await llmThoughts.PromptSmallLLMAsync("You are the world's best prompt engineer for AI models that generate images. Like Dall-E 2, 3 and Midjourney. You take a title for a blog and transform it to a creative but relevant to the content prompt that can be used to generate an image." +
+                    var dallEPrompt = await llmThoughts.PromptLLMAsync("You are the world's best prompt engineer for the Dall-E 3 text to image model. You take a title for a blog and transform it to a creative but relevant to the content prompt that can be used to generate an image." +
                         $"Title: {title.ThrowIfNullOrWhitespace(nameof(title))}" +
-                        "Prompt: ", token);
+                        "Dall-E 3 Prompt: ", token);
                     var dallEImageUrl = await llmThoughts.GenerateImageAndGetUrlAsync(dallEPrompt, token: token);
                     var contentHeader = $@"
                     <figure>
-                      <img src=""{dallEImageUrl}"">
+                      <img src=""{dallEImageUrl}"" alt=""{dallEPrompt}"">
                       <figcaption>Photo by Dall-E 3 (https://bing.com/create).</figcaption>
                     </figure>
                     <hr>