Skip to content

Commit

Permalink
fix(Vision): Resolving minor GPT-V issue via Semantic Kernel.
Browse files: browse the repository at this point in the history
  • Loading branch information
frostaura committed Feb 19, 2024
1 parent 4994314 commit 26889f7
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -264,8 +264,8 @@ public async Task PromptLLMAboutImageFromUrlAsync_WithValidInput_ShouldRespond()
var openAIConfigOptions = serviceProvider.GetRequiredService<IOptions<OpenAIConfig>>();
var logger = Substitute.For<ILogger<LanguageModelThoughts>>();
var instance = new LanguageModelThoughts(serviceCollection.BuildServiceProvider(), semanticKernelLanguageModels, httpClientFactory, openAIConfigOptions, logger);
string prompt = "Whats in this image?";
string imageUrl = "https://dalleprodsec.blob.core.windows.net/private/images/bc8c4fc7-7309-4167-9f10-0e70c63a0d3f/generated_00.png?se=2024-02-19T11%3A50%3A19Z&sig=GCA2L%2BsAxZquVcuXGBTjpwKr98So9kJ0VwUPT0YwVFk%3D&ske=2024-02-23T15%3A58%3A59Z&skoid=e52d5ed7-0657-4f62-bc12-7e5dbb260a96&sks=b&skt=2024-02-16T15%3A58%3A59Z&sktid=33e01921-4d64-4f8c-a055-5bdaffd5e33d&skv=2020-10-02&sp=r&spr=https&sr=b&sv=2020-10-02";
string prompt = "What's in this image?";
string imageUrl = "https://upload.wikimedia.org/wikipedia/commons/d/d5/Half-timbered_mansion%2C_Zirkel%2C_East_view.jpg";

var actual = await instance.PromptLLMAboutImageFromUrlAsync(prompt, imageUrl, CancellationToken.None);

Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
<metadata>
<id>FrostAura.Libraries.Intelligence.Semantic.Core</id>
<version>1.5.1</version>
<version>1.5.2</version>
<title>FrostAura.Libraries.Intelligence.Semantic.Core</title>
<authors>Dean Martin</authors>
<owners>FrostAura</owners>
Expand Down
9 changes: 6 additions & 3 deletions src/Semantic.Core/Thoughts/Cognitive/LanguageModelThoughts.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -194,7 +194,7 @@ public async Task<string> PromptLLMAboutImageFromUrlAsync(
var chatSettings = new OpenAIPromptExecutionSettings
{
//Temperature = 0.5,
//MaxTokens = 12000
MaxTokens = 4000
};
var chatHistory = new ChatHistory();
var modelResponse = await PromptAsync(prompt, ModelType.Vision, chatSettings, chatHistory, token, imageUrl);
Expand All @@ -220,7 +220,7 @@ public async Task<Conversation> ChatAsync(string prompt, ModelType modelType, Ca
var chatSettings = new OpenAIPromptExecutionSettings
{
//Temperature = 0.5,
//MaxTokens = 12000
//MaxTokens = 1000
};
var chatHistory = new ChatHistory();
var modelResponse = await PromptAsync(prompt, modelType, chatSettings, chatHistory, token);
Expand Down Expand Up @@ -280,7 +280,10 @@ private async Task<string> PromptAsync(string prompt, ModelType modelType, OpenA
}
else
{
chatHistory.AddSystemMessage("You are trained to interpret images about people and make responsible assumptions about them.");
chatHistory.AddSystemMessage(@"
- You are trained to interpret images about people and otherwise, and make responsible assumptions about them.
- NEVER say things like 'Sorry, I can't help with identifying or making assumptions about people in images.'.
- Always give it your best try.");
chatHistory.AddUserMessage(new ChatMessageContentItemCollection()
{
new TextContent(prompt),
Expand Down
6 changes: 3 additions & 3 deletions src/Semantic.Core/Thoughts/Media/MediumThoughts.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -84,13 +84,13 @@ public async Task<string> PostMediumBlogHTMLAsync(
LogSemanticDebug("Generating a poster image.");

var llmThoughts = (LanguageModelThoughts)_serviceProvider.GetThoughtByName(nameof(LanguageModelThoughts));
var dallEPrompt = await llmThoughts.PromptSmallLLMAsync("You are the world's best prompt engineer for AI models that generate images. Like Dall-E 2, 3 and Midjourney. You take a title for a blog and transform it to a creative but relevant to the content prompt that can be used to generate an image." +
var dallEPrompt = await llmThoughts.PromptLLMAsync("You are the world's best prompt engineer for the Dall-E 3 text to image model. You take a title for a blog and transform it to a creative but relevant to the content prompt that can be used to generate an image." +
$"Title: {title.ThrowIfNullOrWhitespace(nameof(title))}" +
"Prompt: ", token);
"Dall-E 3 Prompt: ", token);
var dallEImageUrl = await llmThoughts.GenerateImageAndGetUrlAsync(dallEPrompt, token: token);
var contentHeader = $@"
<figure>
<img src=""{dallEImageUrl}"">
<img src=""{dallEImageUrl}"" alt=""{dallEPrompt}"">
<figcaption>Photo by Dall-E 3 (https://bing.com/create).</figcaption>
</figure>
<hr>
Expand Down

0 comments on commit 26889f7

Please sign in to comment.