Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AI Evaluation tests for eShopSupport. #49

Merged
merged 6 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .config/dotnet-tools.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"version": 1,
"isRoot": true,
"tools": {
"microsoft.extensions.ai.evaluation.console": {
"version": "0.9.5-preview",
"commands": [
"aieval"
],
"rollForward": false
}
}
}
48 changes: 28 additions & 20 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
<CentralPackageTransitivePinningEnabled>true</CentralPackageTransitivePinningEnabled>
<MicrosoftExtensionsVersion>8.6.0</MicrosoftExtensionsVersion>
<AspireVersion>8.2.0</AspireVersion>
<MicrosoftExtensionsAiVersion>9.0.0-preview.9.24507.7</MicrosoftExtensionsAiVersion>
<AspireVersion>8.2.2</AspireVersion>
<MicrosoftExtensionsAiVersion>9.0.0-preview.9.24556.5</MicrosoftExtensionsAiVersion>
<MicrosoftExtensionsAiEvaluationVersion>0.9.5-preview</MicrosoftExtensionsAiEvaluationVersion>
</PropertyGroup>
<ItemGroup>
<!-- Version together with Aspire -->
Expand All @@ -17,45 +18,52 @@
<PackageVersion Include="Aspire.Hosting.Testing" Version="$(AspireVersion)" />
<PackageVersion Include="Aspire.Npgsql.EntityFrameworkCore.PostgreSQL" Version="$(AspireVersion)" />
<PackageVersion Include="Aspire.StackExchange.Redis" Version="$(AspireVersion)" />
<PackageVersion Include="Azure.Identity" Version="1.12.0" />
<PackageVersion Include="Azure.AI.OpenAI" Version="2.1.0-beta.1" />
<PackageVersion Include="IdentityModel" Version="7.0.0" />
<PackageVersion Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="8.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.Authentication.OpenIdConnect" Version="8.0.7" />
<PackageVersion Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="8.0.10" />
<PackageVersion Include="Microsoft.AspNetCore.Authentication.OpenIdConnect" Version="8.0.10" />
<PackageVersion Include="Microsoft.Extensions.AI" Version="$(MicrosoftExtensionsAiVersion)" />
<PackageVersion Include="Microsoft.Extensions.AI.Evaluation" Version="$(MicrosoftExtensionsAiEvaluationVersion)" />
<PackageVersion Include="Microsoft.Extensions.AI.Evaluation.Quality" Version="$(MicrosoftExtensionsAiEvaluationVersion)" />
<PackageVersion Include="Microsoft.Extensions.AI.Evaluation.Reporting" Version="$(MicrosoftExtensionsAiEvaluationVersion)" />
<PackageVersion Include="Microsoft.Extensions.AI.Ollama" Version="$(MicrosoftExtensionsAiVersion)" />
<PackageVersion Include="Microsoft.Extensions.AI.OpenAI" Version="$(MicrosoftExtensionsAiVersion)" />
<PackageVersion Include="Microsoft.Extensions.Hosting" Version="8.0.0" />
<PackageVersion Include="Microsoft.Extensions.Hosting" Version="8.0.1" />
<PackageVersion Include="Microsoft.Extensions.ServiceDiscovery" Version="$(AspireVersion)" />
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="0.22.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="0.22.0" />
<!-- Version together with ASP.NET -->
<PackageVersion Include="Microsoft.AspNetCore.Components.QuickGrid" Version="8.0.7" />
<PackageVersion Include="Microsoft.Extensions.Configuration.Json" Version="8.0.0" />
<PackageVersion Include="Microsoft.Extensions.Http.Resilience" Version="8.8.0" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components" Version="4.9.3" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components.DataGrid.EntityFrameworkAdapter" Version="4.9.3" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components.Icons" Version="4.9.3" />
<PackageVersion Include="Microsoft.Playwright" Version="1.45.0" />
<PackageVersion Include="Microsoft.AspNetCore.Components.QuickGrid" Version="8.0.10" />
<PackageVersion Include="Microsoft.Extensions.Configuration.Json" Version="8.0.1" />
<PackageVersion Include="Microsoft.Extensions.Http.Resilience" Version="8.10.0" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components" Version="4.10.3" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components.DataGrid.EntityFrameworkAdapter" Version="4.10.3" />
<PackageVersion Include="Microsoft.FluentUI.AspNetCore.Components.Icons" Version="4.10.3" />
<PackageVersion Include="Microsoft.Playwright" Version="1.48.0" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.OpenAI" Version="1.16.0" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.Qdrant" Version="1.16.0-alpha" />
<PackageVersion Include="Microsoft.SemanticKernel.Core" Version="1.16" />
<PackageVersion Include="Microsoft.SemanticKernel.Core" Version="1.25.0" />
<PackageVersion Include="Microsoft.SemanticKernel" Version="1.16.0" />
<!-- Open Telemetry -->
<PackageVersion Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.9.0" />
<PackageVersion Include="OpenTelemetry.Extensions.Hosting" Version="1.9.0" />
<PackageVersion Include="OpenTelemetry.Instrumentation.AspNetCore" Version="1.9.0" />
<PackageVersion Include="OpenTelemetry.Instrumentation.Http" Version="1.9.0" />
<PackageVersion Include="OpenTelemetry.Instrumentation.Runtime" Version="1.9.0" />
<PackageVersion Include="PdfPig" Version="0.1.9-alpha-20240702-65c64" />
<PackageVersion Include="PdfPig" Version="0.1.9" />
<PackageVersion Include="SmartComponents.AspNetCore" Version="0.1.0-preview10148" />
<PackageVersion Include="SmartComponents.LocalEmbeddings" Version="0.1.0-preview10148" />
<PackageVersion Include="SmartComponents.LocalEmbeddings.SemanticKernel" Version="0.1.0-preview10148" />
<PackageVersion Include="StatefulReconnection" Version="0.1.0" />
<PackageVersion Include="Markdown2Pdf" Version="2.2.1" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.11.0-release-24352-06" />
<PackageVersion Include="System.Text.Json" Version="9.0.0-rc.2.24473.5" />
<PackageVersion Include="xunit" Version="2.9.0" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.11.1" />
<PackageVersion Include="System.Text.Json" Version="9.0.0" />
<PackageVersion Include="xunit" Version="2.9.2" />
<PackageVersion Include="xunit.runner.visualstudio" Version="2.8.2" />
<PackageVersion Include="Duende.IdentityServer" Version="7.0.6" />
<PackageVersion Include="Serilog.AspNetCore" Version="8.0.2-dev-00341" />
<PackageVersion Include="Duende.IdentityServer" Version="7.0.8" />
<PackageVersion Include="Serilog.AspNetCore" Version="8.0.3" />
<PackageVersion Include="System.Runtime.Caching" Version="8.0.1" />
<PackageVersion Include="System.Memory.Data" Version="8.0.1" />
</ItemGroup>
</Project>
</Project>
7 changes: 7 additions & 0 deletions eShopSupport.sln
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "E2ETest", "test\E2ETest\E2E
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "IdentityServer", "src\IdentityServer\IdentityServer.csproj", "{89404F66-90BC-4D45-9061-050334772CDC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EvaluationTests", "test\EvaluationTests\EvaluationTests.csproj", "{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -106,6 +108,10 @@ Global
{89404F66-90BC-4D45-9061-050334772CDC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{89404F66-90BC-4D45-9061-050334772CDC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{89404F66-90BC-4D45-9061-050334772CDC}.Release|Any CPU.Build.0 = Release|Any CPU
{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -124,6 +130,7 @@ Global
{9E677D76-414B-492C-AF74-1B091F940D00} = {2B17496A-5870-4676-927B-6B45871C7BBA}
{F30843C3-AFB5-435E-8A7D-9B4C86A75E18} = {51031120-B48D-4680-8808-26B64F235CB9}
{89404F66-90BC-4D45-9061-050334772CDC} = {7306A281-C46A-4EE3-948D-513D56B1BD34}
{6BE0D6D5-F251-4628-9FB9-B19C565FC5EB} = {51031120-B48D-4680-8808-26B64F235CB9}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {B2E71FD9-E3D5-450C-A9E0-318DAA986F31}
Expand Down
2 changes: 1 addition & 1 deletion seeddata/DataGenerator/Generators/TicketThreadGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ private class AssistantTools(IEmbeddingGenerator<string, Embedding<float>> embed
var chunks = SplitIntoChunks(manual.MarkdownText, 200).ToList();
var embeddings = await embedder.GenerateAsync(chunks);
var candidates = chunks.Zip(embeddings);
var queryEmbedding = (await embedder.GenerateAsync(query)).Single();
var queryEmbedding = (await embedder.GenerateAsync([query])).Single();

var closest = candidates
.Select(c => new { Text = c.First, Similarity = TensorPrimitives.CosineSimilarity(c.Second.Vector.Span, queryEmbedding.Vector.Span) })
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
using Azure.AI.OpenAI;
using System.ClientModel;
using System.ClientModel;
using System.Data.Common;
using Azure.AI.OpenAI;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using OpenAI;
using System.Data.Common;
using Microsoft.Extensions.Configuration;

namespace Microsoft.Extensions.Hosting;

Expand Down
169 changes: 169 additions & 0 deletions test/EvaluationTests/AnswerScoringEvaluator.cs
peterwald marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// --------------------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// --------------------------------------------------------------------------------------------

using System.Text.Json;

namespace Microsoft.Extensions.AI.Evaluation.Quality;

/// <summary>
/// Evaluator that asks a chat model to grade an assistant's answer against an expected ("truth")
/// answer and records the grade as the numeric "Answer Score" metric (1 = worst, 5 = best).
/// </summary>
/// <remarks>
/// The expected answer must be supplied through a <see cref="Context"/> instance in the additional
/// evaluation context; prompt rendering throws <see cref="InvalidOperationException"/> otherwise.
/// </remarks>
public sealed class AnswerScoringEvaluator : ChatConversationEvaluator
{
    /// <summary>
    /// Evaluation context carrying the reference answer that the model response is scored against.
    /// </summary>
    /// <param name="expectedAnswer">The desired true answer for the question being evaluated.</param>
    public sealed class Context(string expectedAnswer) : EvaluationContext
    {
        /// <summary>Gets the reference answer inserted as the "truth" in the scoring prompt.</summary>
        public string ExpectedAnswer { get; } = expectedAnswer;
    }

    private const string MetricName = "Answer Score";

    // Web defaults (camelCase, case-insensitive names) match the JSON shape requested in the prompt.
    // Cached in a static field so the options instance is not re-created for every evaluation.
    private static readonly JsonSerializerOptions s_jsonOptions = new(JsonSerializerDefaults.Web);

    /// <inheritdoc/>
    // NOTE(review): presumably tells the base evaluator not to include earlier conversation turns
    // in the prompt - confirm against ChatConversationEvaluator.
    protected override bool IgnoresHistory => true;

    /// <inheritdoc/>
    public override IReadOnlyCollection<string> EvaluationMetricNames => [MetricName];

    /// <summary>
    /// Creates the result container holding the single, not yet populated, numeric metric.
    /// </summary>
    /// <returns>An <see cref="EvaluationResult"/> containing one <see cref="NumericMetric"/> named "Answer Score".</returns>
    protected override EvaluationResult InitializeResult()
    {
        return new EvaluationResult(new NumericMetric(MetricName));
    }

    /// <summary>
    /// Builds the prompt that asks the judging model to score the assistant's answer.
    /// </summary>
    /// <param name="userRequest">The question that was asked; rendered as empty text when <see langword="null"/>.</param>
    /// <param name="modelResponse">The assistant answer being evaluated.</param>
    /// <param name="includedHistory">Not used by this evaluator.</param>
    /// <param name="additionalContext">Must contain a <see cref="Context"/> with the expected answer.</param>
    /// <param name="token">Cancellation token passed on to message rendering.</param>
    /// <returns>The fully rendered evaluation prompt.</returns>
    /// <exception cref="InvalidOperationException">
    /// No <see cref="Context"/> is present in <paramref name="additionalContext"/>.
    /// </exception>
    protected override async ValueTask<string> RenderEvaluationPromptAsync(
        ChatMessage? userRequest,
        ChatMessage modelResponse,
        IEnumerable<ChatMessage>? includedHistory,
        IEnumerable<EvaluationContext>? additionalContext,
        CancellationToken token)
    {
        // Fail fast: without a reference answer there is nothing to score against.
        if (additionalContext?.OfType<Context>().FirstOrDefault() is not Context context)
        {
            throw new InvalidOperationException("The ExpectedAnswer must be provided in the additional context.");
        }

        string answer = context.ExpectedAnswer;

        string renderedModelResponse = await this.RenderAsync(modelResponse, token);

        string renderedUserRequest =
            userRequest is not null
                ? await this.RenderAsync(userRequest, token)
                : string.Empty;

        // With the $$ raw-string form, single braces are literal JSON braces and {{...}} interpolates.
        var prompt = $$"""
            There is an AI assistant that answers questions about products sold by an online retailer. The questions
            may be asked by customers or by customer support agents.

            You are evaluating the quality of an AI assistant's response to several questions. Here are the
            questions, the desired true answers, and the answers given by the AI system:

            <questions>
                <question index="0">
                    <text>{{renderedUserRequest}}</text>
                    <truth>{{answer}}</truth>
                    <assistantAnswer>{{renderedModelResponse}}</assistantAnswer>
                </question>
            </questions>

            Evaluate each of the assistant's answers separately by replying in this JSON format:

            {
                "scores": [
                    { "index": 0, "descriptionOfQuality": string, "scoreLabel": number },
                    { "index": 1, "descriptionOfQuality": string, "scoreLabel": number },
                    ... etc ...
                ]
            }

            Score only based on whether the assistant's answer is true and answers the question. As long as the
            answer covers the question and is consistent with the truth, it should score as perfect. There is
            no penalty for giving extra on-topic information or advice. Only penalize for missing necessary facts
            or being misleading.

            The descriptionOfQuality should be up to 5 words summarizing to what extent the assistant answer
            is correct and sufficient.

            Based on descriptionOfQuality, the scoreLabel must be a whole number between 1 and 5 inclusive, where 5 is best and 1 is worst.
            Do not use words or fractional values for scoreLabel. You may only pick one of those scores.

            """;

        return prompt;
    }

    /// <summary>
    /// Parses the judging model's JSON reply and stores the first score in the "Answer Score" metric.
    /// </summary>
    /// <remarks>
    /// A reply that is not valid JSON, or that contains no scores, does not throw: an error
    /// diagnostic is attached to the metric and <see cref="Interpret"/> then rates it as inconclusive.
    /// </remarks>
    /// <param name="modelResponseForEvaluationPrompt">Raw text returned for the evaluation prompt.</param>
    /// <param name="result">Result previously created by <see cref="InitializeResult"/>.</param>
    /// <param name="configuration">Not used by this evaluator.</param>
    /// <param name="token">Not used; parsing completes synchronously.</param>
    /// <returns>A completed <see cref="ValueTask"/>.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="result"/> does not contain the expected numeric metric.
    /// </exception>
    protected override ValueTask ParseEvaluationResponseAsync(
        string modelResponseForEvaluationPrompt,
        EvaluationResult result,
        ChatConfiguration configuration,
        CancellationToken token)
    {
        if (!result.TryGet<NumericMetric>(MetricName, out var numericMetric) || numericMetric is null)
        {
            throw new InvalidOperationException("NumericMetric was not properly initialized.");
        }

        AnswerScore? score = null;
        string? parseError = null;

        try
        {
            ScoringResponse? parsedResponse = JsonSerializer.Deserialize<ScoringResponse>(
                TrimMarkdownDelimiters(modelResponseForEvaluationPrompt),
                s_jsonOptions);

            // The reply may be the JSON literal null or may omit "scores" entirely.
            score = parsedResponse?.Scores?.FirstOrDefault();
        }
        catch (JsonException ex)
        {
            // Malformed model output is an expected failure mode; report it on the metric instead of crashing.
            parseError = $"Failed to parse the scoring response: {ex.Message}";
        }

        if (score is null)
        {
            numericMetric.AddDiagnostic(EvaluationDiagnostic.Error(parseError ?? "Score was inconclusive"));
        }
        else
        {
            numericMetric.Value = score.ScoreLabel;

            if (!string.IsNullOrWhiteSpace(score.DescriptionOfQuality))
            {
                numericMetric.AddDiagnostic(EvaluationDiagnostic.Informational(score.DescriptionOfQuality));
            }
        }

        numericMetric.Interpretation = Interpret(numericMetric);

        return default;
    }

    /// <summary>
    /// Maps the metric's numeric value onto an <see cref="EvaluationRating"/>.
    /// </summary>
    /// <param name="metric">The metric to interpret; a missing value is treated as inconclusive.</param>
    /// <returns>
    /// An interpretation whose rating is 1 = Unacceptable, 2 = Poor, 3 = Average, 4 = Good,
    /// 5 = Exceptional, and Inconclusive (flagged as failed) for any other value.
    /// </returns>
    internal static EvaluationMetricInterpretation Interpret(NumericMetric metric)
    {
        // -1 is a sentinel outside the 1..5 range so that "no value" falls through to Inconclusive.
        double score = metric?.Value ?? -1.0;
        EvaluationRating rating = score switch
        {
            1.0 => EvaluationRating.Unacceptable,
            2.0 => EvaluationRating.Poor,
            3.0 => EvaluationRating.Average,
            4.0 => EvaluationRating.Good,
            5.0 => EvaluationRating.Exceptional,
            _ => EvaluationRating.Inconclusive,
        };
        return new EvaluationMetricInterpretation(rating, failed: rating == EvaluationRating.Inconclusive);
    }

    /// <summary>
    /// Strips surrounding whitespace, markdown backticks and a leading <c>json</c> language marker
    /// so that a fenced code block can be handed to the JSON deserializer.
    /// </summary>
    /// <param name="json">The raw model reply.</param>
    /// <returns>A span over the reply with the markdown delimiters removed.</returns>
    internal static ReadOnlySpan<char> TrimMarkdownDelimiters(string json)
    {
#if NETSTANDARD2_0
        ReadOnlySpan<char> trimmed = json.ToCharArray();
#else
        ReadOnlySpan<char> trimmed = json;
#endif
        trimmed = trimmed.Trim().Trim(['`']); // trim whitespace and markdown characters from beginning and end
        // trim 'json' marker from markdown if it exists
        if (trimmed.Length > 4 && trimmed[0..4].SequenceEqual(['j', 's', 'o', 'n']))
        {
            trimmed = trimmed.Slice(4);
        }

        return trimmed;
    }
}

/// <summary>
/// Root object of the JSON reply requested from the judging model by the scoring prompt.
/// </summary>
/// <param name="Scores">The per-question scores taken from the reply's score array.</param>
internal record ScoringResponse(AnswerScore[] Scores);

/// <summary>
/// A single score entry inside a <see cref="ScoringResponse"/>.
/// </summary>
/// <param name="Index">Index of the question the score refers to.</param>
/// <param name="ScoreLabel">Numeric score; the prompt asks for a value from 1 (worst) to 5 (best).</param>
/// <param name="DescriptionOfQuality">Short free-text summary of the answer's quality.</param>
internal record AnswerScore(
    int Index,
    int ScoreLabel,
    string DescriptionOfQuality);
12 changes: 12 additions & 0 deletions test/EvaluationTests/EvalQuestion.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
namespace eShopSupport.EvaluationTests;

/// <summary>
/// Data object describing one evaluation question together with its answer text.
/// </summary>
public class EvalQuestion
{
/// <summary>Gets or sets the numeric identifier of the question.</summary>
public int QuestionId { get; set; }

/// <summary>
/// Gets or sets an optional product identifier; <see langword="null"/> when none is set.
/// NOTE(review): presumably the product the question is about - confirm against the data source.
/// </summary>
public int? ProductId { get; set; }

/// <summary>Gets or sets the question text. Must be set when the object is initialized.</summary>
public required string Question { get; set; }

/// <summary>
/// Gets or sets the answer text. Must be set when the object is initialized.
/// NOTE(review): presumably the expected (reference) answer used for scoring - confirm against usage.
/// </summary>
public required string Answer { get; set; }
}
Loading