diff --git a/src/FoundationaLLM.sln b/src/FoundationaLLM.sln index b340e7fc95..0406170311 100644 --- a/src/FoundationaLLM.sln +++ b/src/FoundationaLLM.sln @@ -11,7 +11,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core", "dotnet\Core\Core.csproj", "{5AA7F0B6-30E6-451A-B1BE-F003BD3EC203}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SemanticKernel", "dotnet\SemanticKernel\SemanticKernel.csproj", "{503CE23D-63D7-4A26-8475-AA71A45D519B}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SemanticKernel-obsolete", "dotnet\SemanticKernel-obsolete\SemanticKernel-obsolete.csproj", "{503CE23D-63D7-4A26-8475-AA71A45D519B}" EndProject Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "LangChainAPI", "python\LangChainAPI\LangChainAPI.pyproj", "{DF3AF954-1999-4244-A783-BCE96EE17816}" EndProject @@ -81,6 +81,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Management", "dotnet\Manage EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManagementAPI", "dotnet\ManagementAPI\ManagementAPI.csproj", "{2D54392A-8D86-4F54-9993-FB3B6C4C090E}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SemanticKernel", "dotnet\SemanticKernel\SemanticKernel.csproj", "{CDB843FE-108B-435A-BF17-68052C64F500}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -194,6 +196,10 @@ Global {2D54392A-8D86-4F54-9993-FB3B6C4C090E}.Debug|Any CPU.Build.0 = Debug|Any CPU {2D54392A-8D86-4F54-9993-FB3B6C4C090E}.Release|Any CPU.ActiveCfg = Release|Any CPU {2D54392A-8D86-4F54-9993-FB3B6C4C090E}.Release|Any CPU.Build.0 = Release|Any CPU + {CDB843FE-108B-435A-BF17-68052C64F500}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CDB843FE-108B-435A-BF17-68052C64F500}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CDB843FE-108B-435A-BF17-68052C64F500}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{CDB843FE-108B-435A-BF17-68052C64F500}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -232,6 +238,7 @@ Global {6330DD34-9B05-4BD9-98E7-507134751CCA} = {23275624-C0DA-4E93-9291-081D75E8CCD2} {46FB5F1B-57C6-4CA3-B626-887DF6D806DD} = {B6DC1190-2873-44A3-85B3-63D7BDE99231} {2D54392A-8D86-4F54-9993-FB3B6C4C090E} = {B6DC1190-2873-44A3-85B3-63D7BDE99231} + {CDB843FE-108B-435A-BF17-68052C64F500} = {B6DC1190-2873-44A3-85B3-63D7BDE99231} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FF5DE858-4B85-4EE8-8A6D-46E8E4FBA078} diff --git a/src/dotnet/Common/Constants/AppConfigurationKeys.cs b/src/dotnet/Common/Constants/AppConfigurationKeys.cs index 51035ba53f..31f306f876 100644 --- a/src/dotnet/Common/Constants/AppConfigurationKeys.cs +++ b/src/dotnet/Common/Constants/AppConfigurationKeys.cs @@ -824,5 +824,14 @@ public static class AppConfigurationKeySections /// The key section for the FoundationaLLM:Vectorization:ContentSources app configuration settings. /// public const string FoundationaLLM_Vectorization_ContentSources = "FoundationaLLM:Vectorization:ContentSources"; + /// + /// The key section for the FoundationaLLM:Vectorization:SemanticKernelTextEmbeddingService app configuration settings. + /// + public const string FoundationaLLM_Vectorization_SemanticKernelTextEmbeddingService = "FoundationaLLM:Vectorization:SemanticKernelTextEmbeddingService"; + + /// + /// The key section for the FoundationaLLM:Vectorization:AzureAISearchIndexingService app configuration settings. 
+ /// + public const string FoundationaLLM_Vectorization_AzureAISearchIndexingService = "FoundationaLLM:Vectorization:AzureAISearchIndexingService"; } } diff --git a/src/dotnet/Common/Constants/DependencyInjectionKeys.cs b/src/dotnet/Common/Constants/DependencyInjectionKeys.cs index 8ab08cc1dd..d29ff08bcf 100644 --- a/src/dotnet/Common/Constants/DependencyInjectionKeys.cs +++ b/src/dotnet/Common/Constants/DependencyInjectionKeys.cs @@ -30,5 +30,15 @@ public static class DependencyInjectionKeys /// The dependency injection key for the content source service factory. /// public const string FoundationaLLM_Vectorization_ContentSourceServiceFactory = "FoundationaLLM:Vectorization:ContentSourceServiceFactory"; + + /// + /// The dependency injection key for the Semantic Kernel text embedding service. + /// + public const string FoundationaLLM_Vectorization_SemanticKernelTextEmbeddingService = "FoundationaLLM:Vectorization:SemanticKernelTextEmbeddingService"; + + /// + /// The dependency injection key for the Azure AI Search indexing service. + /// + public const string FoundationaLLM_Vectorization_AzureAISearchIndexingService = "FoundationaLLM:Vectorization:AzureAISearchIndexingService"; } } diff --git a/src/dotnet/Common/Exceptions/ConfigurationValueException.cs b/src/dotnet/Common/Exceptions/ConfigurationValueException.cs index d09f219bc4..a0c9bc5141 100644 --- a/src/dotnet/Common/Exceptions/ConfigurationValueException.cs +++ b/src/dotnet/Common/Exceptions/ConfigurationValueException.cs @@ -1,10 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace FoundationaLLM.Common.Exceptions +namespace FoundationaLLM.Common.Exceptions { /// /// Represents an error with a configuration value. 
diff --git a/src/dotnet/Common/Interfaces/IIndexingService.cs b/src/dotnet/Common/Interfaces/IIndexingService.cs new file mode 100644 index 0000000000..135735b498 --- /dev/null +++ b/src/dotnet/Common/Interfaces/IIndexingService.cs @@ -0,0 +1,18 @@ +using FoundationaLLM.Common.Models.TextEmbedding; + +namespace FoundationaLLM.Common.Interfaces +{ + /// + /// Provides indexing capabilities for embedding vectors. + /// + public interface IIndexingService + { + /// + /// Adds to a specified index the list of embeddings associated with a content. + /// + /// The containing the embeddings to index. + /// The name of the index. + /// + Task> IndexEmbeddingsAsync(EmbeddedContent embeddedContent, string indexName); + } +} diff --git a/src/dotnet/Common/Interfaces/IResourceProviderService.cs b/src/dotnet/Common/Interfaces/IResourceProviderService.cs index 03d19e4676..970cb1a103 100644 --- a/src/dotnet/Common/Interfaces/IResourceProviderService.cs +++ b/src/dotnet/Common/Interfaces/IResourceProviderService.cs @@ -26,16 +26,16 @@ public interface IResourceProviderService /// /// Gets a resource based on its logical path. /// - /// The type of the requested resource. - /// The logical path of the requested resource. + /// The type of the resource. + /// The logical path of the resource. /// The instance of the resource corresponding to the specified logical path. Task GetResourceAsync(string resourcePath) where T: class; /// /// Gets a resource based on its logical path. /// - /// The type of the requested resource. - /// The logical path of the requested resource. + /// The type of the resource. + /// The logical path of the resource. /// The instance of the resource corresponding to the specified logical path. T GetResource(string resourcePath) where T : class; @@ -45,5 +45,37 @@ public interface IResourceProviderService /// The logical path of the action to be executed. /// The that contains details about the result of the execution. 
Task ExecuteAction(string actionPath); + + /// + /// Creates or updates a resource based on its logical path. + /// + /// The type of the resource. + /// The logical path of the resource. + /// The instance of the resource being created or updated. + /// + Task UpsertResourceAsync(string resourcePath, T resource) where T : class; + + /// + /// Creates or updates a resource based on its logical path. + /// + /// The type of the resource. + /// The logical path of the resource. + /// The instance of the resource being created or updated. + void UpsertResource(string resourcePath, T resource) where T : class; + + /// + /// Deletes a resource based on its logical path. + /// + /// The type of the resource. + /// The logical path of the resource. + /// + Task DeleteResourceAsync(string resourcePath) where T : class; + + /// + /// Deletes a resource based on its logical path. + /// + /// The type of the resource. + /// The logical path of the resource. + void DeleteResource(string resourcePath) where T : class; } } diff --git a/src/dotnet/Common/Interfaces/IServiceFactory`1.cs b/src/dotnet/Common/Interfaces/IServiceFactory`1.cs index 4fa3fd646e..7602ba7b57 100644 --- a/src/dotnet/Common/Interfaces/IServiceFactory`1.cs +++ b/src/dotnet/Common/Interfaces/IServiceFactory`1.cs @@ -1,4 +1,5 @@ -using System; +using FoundationaLLM.Common.Models.Vectorization; +using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -9,14 +10,21 @@ namespace FoundationaLLM.Common.Interfaces /// /// Creates typed service instances. /// - public interface IServiceFactory + public interface IVectorizationServiceFactory { /// - /// Creates a service instance of type T specified by name. + /// Retrieves a service instance of type T specified by name. /// /// The name of the service instance to create. /// The service instance created by name. 
- T CreateService(string serviceName); + T GetService(string serviceName); + + /// + /// Retrieves a service instance of type T specified by name and its associated vectorization profile. + /// + /// The name of the service instance to retrieve. + /// The service instance and its associated vectorization profile. + (T Service, VectorizationProfileBase VectorizationProfile) GetServiceWithProfile(string serviceName); } } diff --git a/src/dotnet/Common/Interfaces/ITextEmbeddingService.cs b/src/dotnet/Common/Interfaces/ITextEmbeddingService.cs new file mode 100644 index 0000000000..0aaed708bc --- /dev/null +++ b/src/dotnet/Common/Interfaces/ITextEmbeddingService.cs @@ -0,0 +1,24 @@ +using FoundationaLLM.Common.Models.TextEmbedding; + +namespace FoundationaLLM.Common.Interfaces +{ + /// + /// Provides text embedding capabilities. + /// + public interface ITextEmbeddingService + { + /// + /// Creates the vector embedding for a specified text. + /// + /// The text which needs to be embedded. + /// Response containing the vector embedding and the amount of tokens used. + Task<(Embedding Embedding, int TokenCount)> GetEmbeddingAsync(string text); + + /// + /// Creates the vector embeddings for a specified list of texts. + /// + /// The list of texts which need to be embedded. + /// Response containing the list of vector embeddings and the amount of tokens used. + Task<(IList Embeddings, int TokenCount)> GetEmbeddingsAsync(IList texts); + } +} diff --git a/src/dotnet/Common/Models/TextEmbedding/ContentIdentifier.cs b/src/dotnet/Common/Models/TextEmbedding/ContentIdentifier.cs new file mode 100644 index 0000000000..0adaadd7f0 --- /dev/null +++ b/src/dotnet/Common/Models/TextEmbedding/ContentIdentifier.cs @@ -0,0 +1,31 @@ +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Common.Models.TextEmbedding; + +/// +/// Represents the content associated with a vectorization request. 
+/// +public class ContentIdentifier +{ + /// + /// The multipart unique identifier of the content (i.e., document) being vectorized. + /// + [JsonPropertyOrder(1)] + [JsonPropertyName("multipart_id")] + public required List MultipartId { get; set; } + + /// + /// The unique identifier of the content (i.e., document) being vectorized. + /// The identifier is determined by concatenating the parts from . + /// + [JsonIgnore] + public string UniqueId => string.Join("/", MultipartId); + + /// + /// The canonical identifier of the content being vectorized. + /// Vectorization state services use it to derive the location of the state in the underlying storage. + /// + [JsonPropertyOrder(2)] + [JsonPropertyName("canonical_id")] + public required string CanonicalId { get; set; } +} diff --git a/src/dotnet/Common/Models/TextEmbedding/EmbeddedContent.cs b/src/dotnet/Common/Models/TextEmbedding/EmbeddedContent.cs new file mode 100644 index 0000000000..a7c20231a6 --- /dev/null +++ b/src/dotnet/Common/Models/TextEmbedding/EmbeddedContent.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Models.TextEmbedding +{ + /// + /// Provides information about embedded content. + /// + public class EmbeddedContent + { + /// + /// The canonical identifier of the content. + /// + public required ContentIdentifier ContentId { get; set; } + + /// + /// The name of the content source profile used to retrieve content. 
+ /// + public required string ContentSourceProfileName { get; set; } + + /// + /// The list of content parts. + /// + public required List ContentParts { get; set; } = []; + } +} diff --git a/src/dotnet/Common/Models/TextEmbedding/EmbeddedContentPart.cs b/src/dotnet/Common/Models/TextEmbedding/EmbeddedContentPart.cs new file mode 100644 index 0000000000..2d7e7054c8 --- /dev/null +++ b/src/dotnet/Common/Models/TextEmbedding/EmbeddedContentPart.cs @@ -0,0 +1,24 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Models.TextEmbedding +{ + /// + /// Provides information about an embedded content part. + /// + public class EmbeddedContentPart + { + /// + /// The text content that was embedded. + /// + public required string Content { get; set; } + + /// + /// The vector embedding associated with the content. + /// + public required Embedding Embedding { get; set; } + } +} diff --git a/src/dotnet/Common/Models/TextEmbedding/Embedding.cs b/src/dotnet/Common/Models/TextEmbedding/Embedding.cs new file mode 100644 index 0000000000..81cca4b2af --- /dev/null +++ b/src/dotnet/Common/Models/TextEmbedding/Embedding.cs @@ -0,0 +1,102 @@ +using System.Runtime.InteropServices; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Common.Models.TextEmbedding +{ + /// + /// Stores a vector embedding. + /// This type should be serialized using Embedding.JsonConverter. + /// + public struct Embedding : IEquatable + { + /// + /// The vector that represents the embedding. + /// This property is only serialized when Embedding.JsonConverter is used. + /// + [JsonIgnore] + public ReadOnlyMemory Vector { get; set; } = new(); + + /// + /// Length of the vector representing the embedding. + /// This property is only serialized when Embedding.JsonConverter is used. 
+ /// + [JsonIgnore] + public readonly int Length => this.Vector.Length; + + /// + /// Creates an embedding from a vector represented as an array of real numbers. + /// + /// The array containing the vector values. + public Embedding(float[] vector) => this.Vector = vector; + + /// + /// Creates an embedding from a vector represented as a object. + /// + /// + public Embedding(ReadOnlyMemory vector) => this.Vector = vector; + + /// + /// Creates an embedding with a zero-initialized vector of a specified size. + /// + /// The size of the vector representing the embedding. + public Embedding(int size) => this.Vector = new ReadOnlyMemory(new float[size]); + + /// + public readonly bool Equals(Embedding other) => this.Vector.Equals(other.Vector); + + /// + /// Indicates whether the current object is equal to another object. + /// + /// An object to compare with this object. + /// True if the object is equal to the obj param and False otherwise. + public override readonly bool Equals(object? obj) => (obj is Embedding other && this.Equals(other)); + + /// + /// Checks if two values are equal. + /// + /// The first value to be checked. + /// The second value to be checked. + /// True if the two values are equal, False otherwise. + public static bool operator ==(Embedding v1, Embedding v2) => v1.Equals(v2); + + /// + /// Checks if two values are different. + /// + /// The first value to be checked. + /// The second value to be checked. + /// True if the two values are different, False otherwise. + public static bool operator !=(Embedding v1, Embedding v2) => !(v1 == v2); + + /// + /// Calculates the hash code for this . + /// + /// The hash value represented by an integer. + public override readonly int GetHashCode() => this.Vector.GetHashCode(); + + /// + /// Serializes the content of an value. 
+ /// Note: use Embedding.JsonConverter to serialize objects using + /// the Embedding type, for example: + /// [JsonPropertyName("vector")] + /// [JsonConverter(typeof(Embedding.JsonConverter))] + /// public Embedding Vector { get; set; } + /// + public sealed class JsonConverter : JsonConverter + { + /// An instance of a converter for float[] that all operations delegate to + private static readonly JsonConverter Converter = + (JsonConverter)new JsonSerializerOptions().GetConverter(typeof(float[])); + + /// + public override Embedding Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) => + new(Converter.Read(ref reader, typeof(float[]), options) ?? []); + + /// + public override void Write(Utf8JsonWriter writer, Embedding value, JsonSerializerOptions options) => + Converter.Write(writer, MemoryMarshal.TryGetArray(value.Vector, out ArraySegment array) && array.Count == value.Length + ? array.Array! + : value.Vector.ToArray(), options); + } + } +} diff --git a/src/dotnet/Common/Models/Vectorization/VectorizationProfileBase.cs b/src/dotnet/Common/Models/Vectorization/VectorizationProfileBase.cs new file mode 100644 index 0000000000..defb48ad7f --- /dev/null +++ b/src/dotnet/Common/Models/Vectorization/VectorizationProfileBase.cs @@ -0,0 +1,18 @@ +namespace FoundationaLLM.Common.Models.Vectorization +{ + /// + /// Basic properties for vectorization profiles. + /// + public class VectorizationProfileBase + { + /// + /// The name of the vectorization profile. + /// + public required string Name { get; set; } + + /// + /// The configuration associated with the vectorization profile. + /// + public Dictionary? 
Settings { get; set; } = []; + } +} diff --git a/src/dotnet/Common/Services/ResourceProviders/ResourceProviderServiceBase.cs b/src/dotnet/Common/Services/ResourceProviders/ResourceProviderServiceBase.cs index ca45c18b3f..bd86589318 100644 --- a/src/dotnet/Common/Services/ResourceProviders/ResourceProviderServiceBase.cs +++ b/src/dotnet/Common/Services/ResourceProviders/ResourceProviderServiceBase.cs @@ -94,6 +94,42 @@ public async Task GetResourceAsync(string resourcePath) where T : class return await GetResourceAsyncInternal(instances); } + /// + public async Task UpsertResourceAsync(string resourcePath, T resource) where T : class + { + if (!_isInitialized) + throw new ResourceProviderException($"The resource provider {_name} is not initialized."); + var instances = GetResourceInstancesFromPath(resourcePath); + await UpsertResourceAsync(instances, resource); + } + + /// + public void UpsertResource(string resourcePath, T resource) where T : class + { + if (!_isInitialized) + throw new ResourceProviderException($"The resource provider {_name} is not initialized."); + var instances = GetResourceInstancesFromPath(resourcePath); + UpsertResource(instances, resource); + } + + /// + public async Task DeleteResourceAsync(string resourcePath) where T : class + { + if (!_isInitialized) + throw new ResourceProviderException($"The resource provider {_name} is not initialized."); + var instances = GetResourceInstancesFromPath(resourcePath); + await DeleteResourceAsync(instances); + } + + /// + public void DeleteResource(string resourcePath) where T : class + { + if (!_isInitialized) + throw new ResourceProviderException($"The resource provider {_name} is not initialized."); + var instances = GetResourceInstancesFromPath(resourcePath); + DeleteResource(instances); + } + /// /// The internal implementation of Initialize. Must be overridden in derived classes. 
/// @@ -134,6 +170,46 @@ protected virtual async Task GetResourceAsyncInternal(List + /// The internal implementation of UpsertResource. Must be overridden in derived classes. + /// + /// The list of objects parsed from the resource path. + /// The instance of the resource being created or updated. + /// + protected virtual void UpsertResource(List instances, T resource) => + throw new NotImplementedException(); + + /// + /// The internal implementation of UpsertResourceAsync. Must be overridden in derived classes. + /// + /// The list of objects parsed from the resource path. + /// The instance of the resource being created or updated. + /// + protected virtual async Task UpsertResourceAsync(List instances, T resource) + { + await Task.CompletedTask; + throw new NotImplementedException(); + } + + /// + /// The internal implementation of DeleteResource. Must be overridden in derived classes. + /// + /// The list of objects parsed from the resource path. + /// + protected virtual void DeleteResource(List instances) => + throw new NotImplementedException(); + + /// + /// The internal implementation of DeleteResourceAsync. Must be overridden in derived classes. + /// + /// The list of objects parsed from the resource path. 
+ /// + protected virtual async Task DeleteResourceAsync(List instances) + { + await Task.CompletedTask; + throw new NotImplementedException(); + } + private List GetResourceInstancesFromPath(string resourcePath) { if (string.IsNullOrWhiteSpace(resourcePath)) diff --git a/src/dotnet/Common/Services/TextSplitters/TokenTextSplitterService.cs b/src/dotnet/Common/Services/TextSplitters/TokenTextSplitterService.cs index 48e2b84fd8..7a8d5d0e55 100644 --- a/src/dotnet/Common/Services/TextSplitters/TokenTextSplitterService.cs +++ b/src/dotnet/Common/Services/TextSplitters/TokenTextSplitterService.cs @@ -5,74 +5,77 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +namespace FoundationaLLM.Common.Services.TextSplitters +{ -/// -/// Splits text based on number of tokens. -/// -/// The used to tokenize the input text. -/// The providing the settings for the service. -/// The logger used for logging. -public class TokenTextSplitterService( + /// + /// Splits text based on number of tokens. + /// + /// The used to tokenize the input text. + /// The providing the settings for the service. + /// The logger used for logging. 
+ public class TokenTextSplitterService( ITokenizerService tokenizerService, IOptions options, ILogger logger) : ITextSplitterService -{ - private readonly ITokenizerService _tokenizerService = tokenizerService; - private readonly TokenTextSplitterServiceSettings _settings = options.Value; - private readonly ILogger _logger = logger; - - /// - public (List TextChunks, string Message) SplitPlainText(string text) { - var tokens = _tokenizerService.Encode(text, _settings.TokenizerEncoder); + private readonly ITokenizerService _tokenizerService = tokenizerService; + private readonly TokenTextSplitterServiceSettings _settings = options.Value; + private readonly ILogger _logger = logger; - if (tokens != null) + /// + public (List TextChunks, string Message) SplitPlainText(string text) { - _logger.LogInformation("The tokenizer identified {TokensCount} tokens.", tokens.Count); + var tokens = _tokenizerService.Encode(text, _settings.TokenizerEncoder); - var chunksCount = (int) Math.Ceiling((1f * tokens!.Count - _settings.OverlapSizeTokens) / (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens)); + if (tokens != null) + { + _logger.LogInformation("The tokenizer identified {TokensCount} tokens.", tokens.Count); - var chunks = Enumerable.Range(0, chunksCount - 1) - .Select(i => tokens.Skip(i * (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens)).Take(_settings.ChunkSizeTokens).ToArray()) - .Select(t => _tokenizerService.Decode(t, _settings.TokenizerEncoder)) - .ToList(); + var chunksCount = (int)Math.Ceiling((1f * tokens!.Count - _settings.OverlapSizeTokens) / (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens)); - var lastChunkStart = (chunksCount - 1) * _settings.ChunkSizeTokens; - var lastChunkSize = tokens.Count - lastChunkStart + 1; - var resultMessage = string.Empty; + var chunks = Enumerable.Range(0, chunksCount - 1) + .Select(i => tokens.Skip(i * (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens)).Take(_settings.ChunkSizeTokens).ToArray()) + 
.Select(t => _tokenizerService.Decode(t, _settings.TokenizerEncoder)) + .ToList(); - if (lastChunkSize < 2 * _settings.OverlapSizeTokens) - { - // The last chunk is to small, will just incorporate it into the second to last. - var secondToLastChunkStart = (chunksCount - 2) * _settings.ChunkSizeTokens; - var newLastChunkSize = tokens.Count - secondToLastChunkStart + 1; - var newLastChunk = _tokenizerService.Decode( - tokens - .Skip(secondToLastChunkStart) - .Take(newLastChunkSize) - .ToArray(), - _settings.TokenizerEncoder); - chunks.RemoveAt(chunks.Count - 1); - chunks.Add(newLastChunk); + var lastChunkStart = (chunksCount - 1) * _settings.ChunkSizeTokens; + var lastChunkSize = tokens.Count - lastChunkStart + 1; + var resultMessage = string.Empty; - resultMessage = $"The number of text chunks is {chunks.Count}. The size of the last chunk is {newLastChunkSize} tokens."; - } - else - { - var lastChunk = _tokenizerService.Decode( - tokens - .Skip(lastChunkStart) - .Take(lastChunkSize) - .ToArray(), - _settings.TokenizerEncoder); - chunks.Add(lastChunk); + if (lastChunkSize < 2 * _settings.OverlapSizeTokens) + { + // The last chunk is to small, will just incorporate it into the second to last. + var secondToLastChunkStart = (chunksCount - 2) * _settings.ChunkSizeTokens; + var newLastChunkSize = tokens.Count - secondToLastChunkStart + 1; + var newLastChunk = _tokenizerService.Decode( + tokens + .Skip(secondToLastChunkStart) + .Take(newLastChunkSize) + .ToArray(), + _settings.TokenizerEncoder); + chunks.RemoveAt(chunks.Count - 1); + chunks.Add(newLastChunk); - resultMessage = $"The number of text chunks is {chunks.Count}. The size of the last chunk is {lastChunkSize} tokens."; - } + resultMessage = $"The number of text chunks is {chunks.Count}. 
The size of the last chunk is {newLastChunkSize} tokens."; + } + else + { + var lastChunk = _tokenizerService.Decode( + tokens + .Skip(lastChunkStart) + .Take(lastChunkSize) + .ToArray(), + _settings.TokenizerEncoder); + chunks.Add(lastChunk); - return new (chunks, resultMessage); + resultMessage = $"The number of text chunks is {chunks.Count}. The size of the last chunk is {lastChunkSize} tokens."; + } + + return new(chunks, resultMessage); + } + else + throw new TextProcessingException("The tokenizer service failed to split the text into tokens."); } - else - throw new TextProcessingException("The tokenizer service failed to split the text into tokens."); } } diff --git a/src/dotnet/Common/Settings/AzureAISearchAuthenticationTypes.cs b/src/dotnet/Common/Settings/AzureAISearchAuthenticationTypes.cs new file mode 100644 index 0000000000..e65ec4d55e --- /dev/null +++ b/src/dotnet/Common/Settings/AzureAISearchAuthenticationTypes.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Settings +{ + /// + /// Types of authentication for Azure AI Search. + /// + public enum AzureAISearchAuthenticationTypes + { + /// + /// Unknown authentication type. + /// + Unknown = -1, + + /// + /// Azure managed identity authentication type. + /// + AzureIdentity, + + /// + /// API key authentication type. + /// + APIKey + } +} diff --git a/src/dotnet/Common/Settings/AzureOpenAIAuthenticationTypes.cs b/src/dotnet/Common/Settings/AzureOpenAIAuthenticationTypes.cs new file mode 100644 index 0000000000..a9fbe88e5b --- /dev/null +++ b/src/dotnet/Common/Settings/AzureOpenAIAuthenticationTypes.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Common.Settings +{ + /// + /// Types of authentication for Azure Open AI. 
+ /// + public enum AzureOpenAIAuthenticationTypes + { + /// + /// Unknown authentication type. + /// + Unknown = -1, + + /// + /// Azure managed identity authentication type. + /// + AzureIdentity, + + /// + /// API key authentication type. + /// + APIKey + } +} diff --git a/src/dotnet/SemanticKernel/Chat/ChatBuilder.cs b/src/dotnet/SemanticKernel-obsolete/Chat/ChatBuilder.cs similarity index 100% rename from src/dotnet/SemanticKernel/Chat/ChatBuilder.cs rename to src/dotnet/SemanticKernel-obsolete/Chat/ChatBuilder.cs diff --git a/src/dotnet/SemanticKernel/Chat/PromptOptimizationSettings.cs b/src/dotnet/SemanticKernel-obsolete/Chat/PromptOptimizationSettings.cs similarity index 100% rename from src/dotnet/SemanticKernel/Chat/PromptOptimizationSettings.cs rename to src/dotnet/SemanticKernel-obsolete/Chat/PromptOptimizationSettings.cs diff --git a/src/dotnet/SemanticKernel/Interfaces/IMemorySource.cs b/src/dotnet/SemanticKernel-obsolete/Interfaces/IMemorySource.cs similarity index 100% rename from src/dotnet/SemanticKernel/Interfaces/IMemorySource.cs rename to src/dotnet/SemanticKernel-obsolete/Interfaces/IMemorySource.cs diff --git a/src/dotnet/SemanticKernel/Interfaces/ISemanticKernelService.cs b/src/dotnet/SemanticKernel-obsolete/Interfaces/ISemanticKernelService.cs similarity index 100% rename from src/dotnet/SemanticKernel/Interfaces/ISemanticKernelService.cs rename to src/dotnet/SemanticKernel-obsolete/Interfaces/ISemanticKernelService.cs diff --git a/src/dotnet/SemanticKernel/Interfaces/ISystemPromptService.cs b/src/dotnet/SemanticKernel-obsolete/Interfaces/ISystemPromptService.cs similarity index 100% rename from src/dotnet/SemanticKernel/Interfaces/ISystemPromptService.cs rename to src/dotnet/SemanticKernel-obsolete/Interfaces/ISystemPromptService.cs diff --git a/src/dotnet/SemanticKernel/Interfaces/ITokenizer.cs b/src/dotnet/SemanticKernel-obsolete/Interfaces/ITokenizer.cs similarity index 100% rename from src/dotnet/SemanticKernel/Interfaces/ITokenizer.cs 
rename to src/dotnet/SemanticKernel-obsolete/Interfaces/ITokenizer.cs diff --git a/src/dotnet/SemanticKernel/MemorySource/AzureCognitiveSearchMemorySource.cs b/src/dotnet/SemanticKernel-obsolete/MemorySource/AzureCognitiveSearchMemorySource.cs similarity index 100% rename from src/dotnet/SemanticKernel/MemorySource/AzureCognitiveSearchMemorySource.cs rename to src/dotnet/SemanticKernel-obsolete/MemorySource/AzureCognitiveSearchMemorySource.cs diff --git a/src/dotnet/SemanticKernel/MemorySource/AzureCognitiveSearchMemorySourceConfig.cs b/src/dotnet/SemanticKernel-obsolete/MemorySource/AzureCognitiveSearchMemorySourceConfig.cs similarity index 100% rename from src/dotnet/SemanticKernel/MemorySource/AzureCognitiveSearchMemorySourceConfig.cs rename to src/dotnet/SemanticKernel-obsolete/MemorySource/AzureCognitiveSearchMemorySourceConfig.cs diff --git a/src/dotnet/SemanticKernel/MemorySource/BlobStorageMemorySource.cs b/src/dotnet/SemanticKernel-obsolete/MemorySource/BlobStorageMemorySource.cs similarity index 100% rename from src/dotnet/SemanticKernel/MemorySource/BlobStorageMemorySource.cs rename to src/dotnet/SemanticKernel-obsolete/MemorySource/BlobStorageMemorySource.cs diff --git a/src/dotnet/SemanticKernel/MemorySource/BlobStorageMemorySourceConfig.cs b/src/dotnet/SemanticKernel-obsolete/MemorySource/BlobStorageMemorySourceConfig.cs similarity index 100% rename from src/dotnet/SemanticKernel/MemorySource/BlobStorageMemorySourceConfig.cs rename to src/dotnet/SemanticKernel-obsolete/MemorySource/BlobStorageMemorySourceConfig.cs diff --git a/src/dotnet/SemanticKernel/Models/ConfigurationOptions/AzureCognitiveSearchMemorySourceSettings.cs b/src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/AzureCognitiveSearchMemorySourceSettings.cs similarity index 100% rename from src/dotnet/SemanticKernel/Models/ConfigurationOptions/AzureCognitiveSearchMemorySourceSettings.cs rename to 
src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/AzureCognitiveSearchMemorySourceSettings.cs diff --git a/src/dotnet/SemanticKernel/Models/ConfigurationOptions/BlobStorageMemorySourceSettings.cs b/src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/BlobStorageMemorySourceSettings.cs similarity index 100% rename from src/dotnet/SemanticKernel/Models/ConfigurationOptions/BlobStorageMemorySourceSettings.cs rename to src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/BlobStorageMemorySourceSettings.cs diff --git a/src/dotnet/SemanticKernel/Models/ConfigurationOptions/DurableSystemPromptServiceSettings.cs b/src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/DurableSystemPromptServiceSettings.cs similarity index 100% rename from src/dotnet/SemanticKernel/Models/ConfigurationOptions/DurableSystemPromptServiceSettings.cs rename to src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/DurableSystemPromptServiceSettings.cs diff --git a/src/dotnet/SemanticKernel/Models/ConfigurationOptions/SemanticKernelServiceSettings.cs b/src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/SemanticKernelServiceSettings.cs similarity index 100% rename from src/dotnet/SemanticKernel/Models/ConfigurationOptions/SemanticKernelServiceSettings.cs rename to src/dotnet/SemanticKernel-obsolete/Models/ConfigurationOptions/SemanticKernelServiceSettings.cs diff --git a/src/dotnet/SemanticKernel/Plugins/Core/TextEmbeddingObjectMemoryPlugin.cs b/src/dotnet/SemanticKernel-obsolete/Plugins/Core/TextEmbeddingObjectMemoryPlugin.cs similarity index 100% rename from src/dotnet/SemanticKernel/Plugins/Core/TextEmbeddingObjectMemoryPlugin.cs rename to src/dotnet/SemanticKernel-obsolete/Plugins/Core/TextEmbeddingObjectMemoryPlugin.cs diff --git a/src/dotnet/SemanticKernel/Plugins/Core/TextSummaryPlugin.cs b/src/dotnet/SemanticKernel-obsolete/Plugins/Core/TextSummaryPlugin.cs similarity index 100% rename from 
src/dotnet/SemanticKernel/Plugins/Core/TextSummaryPlugin.cs rename to src/dotnet/SemanticKernel-obsolete/Plugins/Core/TextSummaryPlugin.cs diff --git a/src/dotnet/SemanticKernel/Plugins/Memory/VectorMemoryStore.cs b/src/dotnet/SemanticKernel-obsolete/Plugins/Memory/VectorMemoryStore.cs similarity index 100% rename from src/dotnet/SemanticKernel/Plugins/Memory/VectorMemoryStore.cs rename to src/dotnet/SemanticKernel-obsolete/Plugins/Memory/VectorMemoryStore.cs diff --git a/src/dotnet/SemanticKernel-obsolete/SemanticKernel-obsolete.csproj b/src/dotnet/SemanticKernel-obsolete/SemanticKernel-obsolete.csproj new file mode 100644 index 0000000000..7279918af0 --- /dev/null +++ b/src/dotnet/SemanticKernel-obsolete/SemanticKernel-obsolete.csproj @@ -0,0 +1,29 @@ + + + + net8.0 + enable + enable + FoundationaLLM.SemanticKernel.Core + FoundationaLLM.SemanticKernel.Core + True + + + + + + + + + + + + + + + + + + + + diff --git a/src/dotnet/SemanticKernel/Services/DurableSystemPromptService.cs b/src/dotnet/SemanticKernel-obsolete/Services/DurableSystemPromptService.cs similarity index 100% rename from src/dotnet/SemanticKernel/Services/DurableSystemPromptService.cs rename to src/dotnet/SemanticKernel-obsolete/Services/DurableSystemPromptService.cs diff --git a/src/dotnet/SemanticKernel/Services/InMemorySystemPromptService.cs b/src/dotnet/SemanticKernel-obsolete/Services/InMemorySystemPromptService.cs similarity index 100% rename from src/dotnet/SemanticKernel/Services/InMemorySystemPromptService.cs rename to src/dotnet/SemanticKernel-obsolete/Services/InMemorySystemPromptService.cs diff --git a/src/dotnet/SemanticKernel/Services/SemanticKernelService.cs b/src/dotnet/SemanticKernel-obsolete/Services/SemanticKernelService.cs similarity index 100% rename from src/dotnet/SemanticKernel/Services/SemanticKernelService.cs rename to src/dotnet/SemanticKernel-obsolete/Services/SemanticKernelService.cs diff --git a/src/dotnet/SemanticKernel/Services/SemanticKernelTokenizer.cs 
b/src/dotnet/SemanticKernel-obsolete/Services/SemanticKernelTokenizer.cs similarity index 100% rename from src/dotnet/SemanticKernel/Services/SemanticKernelTokenizer.cs rename to src/dotnet/SemanticKernel-obsolete/Services/SemanticKernelTokenizer.cs diff --git a/src/dotnet/SemanticKernel/Text/StringExtensions.cs b/src/dotnet/SemanticKernel-obsolete/Text/StringExtensions.cs similarity index 100% rename from src/dotnet/SemanticKernel/Text/StringExtensions.cs rename to src/dotnet/SemanticKernel-obsolete/Text/StringExtensions.cs diff --git a/src/dotnet/SemanticKernel/TextEmbedding/EmbeddingUtility.cs b/src/dotnet/SemanticKernel-obsolete/TextEmbedding/EmbeddingUtility.cs similarity index 100% rename from src/dotnet/SemanticKernel/TextEmbedding/EmbeddingUtility.cs rename to src/dotnet/SemanticKernel-obsolete/TextEmbedding/EmbeddingUtility.cs diff --git a/src/dotnet/SemanticKernel/nuget.config b/src/dotnet/SemanticKernel-obsolete/nuget.config similarity index 100% rename from src/dotnet/SemanticKernel/nuget.config rename to src/dotnet/SemanticKernel-obsolete/nuget.config diff --git a/src/dotnet/SemanticKernel/Models/Configuration/AzureAISearchIndexingServiceSettings.cs b/src/dotnet/SemanticKernel/Models/Configuration/AzureAISearchIndexingServiceSettings.cs new file mode 100644 index 0000000000..bb65090224 --- /dev/null +++ b/src/dotnet/SemanticKernel/Models/Configuration/AzureAISearchIndexingServiceSettings.cs @@ -0,0 +1,32 @@ +using FoundationaLLM.Common.Settings; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.Json.Serialization; +using System.Threading.Tasks; + +namespace FoundationaLLM.SemanticKernel.Core.Models.Configuration +{ + /// + /// Provides configuration settings for the Azure AI Search indexing service. + /// + public record AzureAISearchIndexingServiceSettings + { + /// + /// The endpoint of the Azure AI deployment. 
+ /// + public required string Endpoint { get; set; } + + /// + /// The API key used to connect to the Azure AI Search endpoint. Valid only if AuthenticationType is APIKey. + /// + public string? APIKey { get; set; } + + /// + /// The indicating which authentication mechanism to use. + /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required AzureAISearchAuthenticationTypes AuthenticationType { get; set; } + } +} diff --git a/src/dotnet/SemanticKernel/Models/Configuration/SemanticKernelTextEmbeddingServiceSettings.cs b/src/dotnet/SemanticKernel/Models/Configuration/SemanticKernelTextEmbeddingServiceSettings.cs new file mode 100644 index 0000000000..538a54da34 --- /dev/null +++ b/src/dotnet/SemanticKernel/Models/Configuration/SemanticKernelTextEmbeddingServiceSettings.cs @@ -0,0 +1,32 @@ +using FoundationaLLM.Common.Settings; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.SemanticKernel.Core.Models.Configuration +{ + /// + /// Provides configuration settings for the service. + /// + public record SemanticKernelTextEmbeddingServiceSettings + { + /// + /// The name of the Azure Open AI deployment. + /// + public required string DeploymentName { get; set; } + + /// + /// The endpoint of the Azure Open AI deployment. + /// + public required string Endpoint { get; set; } + + /// + /// The API key used to connect to the Azure Open AI endpoint. Valid only if AuthenticationType is APIKey. + /// + public string? APIKey { get; set; } + + /// + /// The indicating which authentication mechanism to use. 
+ /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required AzureOpenAIAuthenticationTypes AuthenticationType { get; set; } + } +} diff --git a/src/dotnet/SemanticKernel/SemanticKernel.csproj b/src/dotnet/SemanticKernel/SemanticKernel.csproj index 7279918af0..3b2f01cb40 100644 --- a/src/dotnet/SemanticKernel/SemanticKernel.csproj +++ b/src/dotnet/SemanticKernel/SemanticKernel.csproj @@ -6,24 +6,16 @@ enable FoundationaLLM.SemanticKernel.Core FoundationaLLM.SemanticKernel.Core - True + True - - - - - - - - - - - - - + + + + + - + diff --git a/src/dotnet/SemanticKernel/Services/AzureAISearchIndexingService.cs b/src/dotnet/SemanticKernel/Services/AzureAISearchIndexingService.cs new file mode 100644 index 0000000000..3411b15b8f --- /dev/null +++ b/src/dotnet/SemanticKernel/Services/AzureAISearchIndexingService.cs @@ -0,0 +1,117 @@ +using Azure.Identity; +using FoundationaLLM.Common.Exceptions; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Common.Settings; +using FoundationaLLM.SemanticKernel.Core.Models.Configuration; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Connectors.AzureAISearch; +using Microsoft.SemanticKernel.Embeddings; +using Microsoft.SemanticKernel.Memory; +using System.ComponentModel; + +#pragma warning disable SKEXP0003, SKEXP0021 + +namespace FoundationaLLM.SemanticKernel.Core.Services +{ + /// + /// Provides vector embedding indexing based on Azure AI Search. + /// + public class AzureAISearchIndexingService : IIndexingService + { + private readonly AzureAISearchIndexingServiceSettings _settings; + private readonly ILogger _logger; + private readonly AzureAISearchMemoryStore _memoryStore; + + /// + /// Creates a new instance. + /// + /// The providing configuration settings. + /// The used for logging. 
+ public AzureAISearchIndexingService( + IOptions options, + ILogger logger) + { + _settings = options.Value; + _logger = logger; + _memoryStore = CreateMemoryStore(); + } + + /// + public async Task> IndexEmbeddingsAsync(EmbeddedContent embeddedContent, string indexName) + { + var indexIds = new List(); + + await foreach (var id in _memoryStore.UpsertBatchAsync( + indexName, + embeddedContent.ContentParts.Select(cp => new MemoryRecord( + new MemoryRecordMetadata( + true, + embeddedContent.ContentId.UniqueId, + cp.Content, + "Generated by FoundationaLLM.", + embeddedContent.ContentSourceProfileName, + string.Empty), + cp.Embedding.Vector, + embeddedContent.ContentId.UniqueId, + DateTimeOffset.UtcNow)))) + { + indexIds.Add(id); + } + + return indexIds; + } + + /// + /// Creates an instance using the endpoint and the API key. + /// + /// The endpoint of the Azure AI Search deployment. + /// The API key used to connect to the Azure AI Search deployment. + /// The instance. + private AzureAISearchMemoryStore CreateMemoryStoreFromAPIKey(string endpoint, string apiKey) => + new AzureAISearchMemoryStore(endpoint, apiKey); + + /// + /// Creates an instance using the endpoint and the Azure identity. + /// + /// The endpoint of the Azure AI Search deployment. + /// The instance. + private AzureAISearchMemoryStore CreateMemoryStoreFromIdentity(string endpoint) => + new AzureAISearchMemoryStore(endpoint, new DefaultAzureCredential()); + + private void ValidateEndpoint(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure AI Search endpoint is invalid."); + throw new ConfigurationValueException("The Azure AI Search endpoint is invalid."); + } + } + private void ValidateAPIKey(string? 
value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure AI Search API key is invalid."); + throw new ConfigurationValueException("The Azure AI Search API key is invalid."); + } + } + + private AzureAISearchMemoryStore CreateMemoryStore() + { + switch (_settings.AuthenticationType) + { + case AzureAISearchAuthenticationTypes.APIKey: + ValidateEndpoint(_settings.Endpoint); + ValidateAPIKey(_settings.APIKey); + return CreateMemoryStoreFromAPIKey(_settings.Endpoint, _settings.APIKey!); + case AzureAISearchAuthenticationTypes.AzureIdentity: + ValidateEndpoint(_settings.Endpoint); + return CreateMemoryStoreFromIdentity(_settings.Endpoint); + default: + throw new InvalidEnumArgumentException($"The authentication type {_settings.AuthenticationType} is not supported."); + } + } + } +} diff --git a/src/dotnet/SemanticKernel/Services/SemanticKernelTextEmbeddingService.cs b/src/dotnet/SemanticKernel/Services/SemanticKernelTextEmbeddingService.cs new file mode 100644 index 0000000000..1ab4983d68 --- /dev/null +++ b/src/dotnet/SemanticKernel/Services/SemanticKernelTextEmbeddingService.cs @@ -0,0 +1,127 @@ +using Azure.Identity; +using FoundationaLLM.Common.Exceptions; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Common.Settings; +using FoundationaLLM.SemanticKernel.Core.Models.Configuration; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Embeddings; +using System.ComponentModel; + +#pragma warning disable SKEXP0001, SKEXP0011 + +namespace FoundationaLLM.SemanticKernel.Core.Services +{ + /// + /// Generates text embeddings using the Semantic Kernel orchestrator. 
+ /// + public class SemanticKernelTextEmbeddingService : ITextEmbeddingService + { + private readonly SemanticKernelTextEmbeddingServiceSettings _settings; + private readonly ILogger _logger; + private readonly Kernel _kernel; + private readonly ITextEmbeddingGenerationService _textEmbeddingService; + + /// + /// Creates a new instance. + /// + /// The providing configuration settings. + /// The used for logging. + public SemanticKernelTextEmbeddingService( + IOptions options, + ILogger logger) + { + _settings = options.Value; + _logger = logger; + _kernel = CreateKernel(); + _textEmbeddingService = _kernel.GetRequiredService(); + } + + /// + public async Task<(Embedding Embedding, int TokenCount)> GetEmbeddingAsync(string text) + { + var embedding = await _textEmbeddingService.GenerateEmbeddingAsync(text); + return new(new(embedding), 0); + } + + /// + public async Task<(IList Embeddings, int TokenCount)> GetEmbeddingsAsync(IList texts) + { + var embeddings = await _textEmbeddingService.GenerateEmbeddingsAsync(texts); + return new(embeddings.Select(e => new Embedding(e)).ToList(), 0); + } + + /// + /// Creates a instance using the deployment name, endpoint, and API key. + /// + /// The name of the Azure Open AI deployment. + /// The endpoint of the Azure Open AI deployment. + /// The API key used to connect to the Azure Open AI deployment. + /// The instance. + private Kernel CreateKernelFromAPIKey(string deploymentName, string endpoint, string apiKey) + { + var builder = Kernel.CreateBuilder(); + builder.AddAzureOpenAITextEmbeddingGeneration(deploymentName, endpoint, apiKey); + return builder.Build(); + } + + /// + /// Creates a instance using the deployment name, endpoint, and the Azure identity. + /// + /// The name of the Azure Open AI deployment. + /// The endpoint of the Azure Open AI deployment. + /// The instance. 
+ private Kernel CreateKernelFromIdentity(string deploymentName, string endpoint) + { + var builder = Kernel.CreateBuilder(); + builder.AddAzureOpenAITextEmbeddingGeneration(deploymentName, endpoint, new DefaultAzureCredential()); + return builder.Build(); + } + + private void ValidateDeploymentName(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure Open AI deployment name is invalid."); + throw new ConfigurationValueException("The Azure Open AI deployment name is invalid."); + } + } + + private void ValidateEndpoint(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure Open AI endpoint is invalid."); + throw new ConfigurationValueException("The Azure Open AI endpoint is invalid."); + } + } + private void ValidateAPIKey(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + _logger.LogCritical("The Azure Open AI API key is invalid."); + throw new ConfigurationValueException("The Azure Open AI API key is invalid."); + } + } + + private Kernel CreateKernel() + { + switch (_settings.AuthenticationType) + { + case AzureOpenAIAuthenticationTypes.APIKey: + ValidateDeploymentName(_settings.DeploymentName); + ValidateEndpoint(_settings.Endpoint); + ValidateAPIKey(_settings.APIKey); + return CreateKernelFromAPIKey(_settings.DeploymentName, _settings.Endpoint, _settings.APIKey!); + case AzureOpenAIAuthenticationTypes.AzureIdentity: + ValidateDeploymentName(_settings.DeploymentName); + ValidateEndpoint(_settings.Endpoint); + return CreateKernelFromIdentity(_settings.DeploymentName, _settings.Endpoint); + default: + throw new InvalidEnumArgumentException($"The authentication type {_settings.AuthenticationType} is not supported."); + } + } + } +} diff --git a/src/dotnet/SemanticKernelAPI/SemanticKernelAPI.csproj b/src/dotnet/SemanticKernelAPI/SemanticKernelAPI.csproj index 389c777f82..b123838bae 100644 --- a/src/dotnet/SemanticKernelAPI/SemanticKernelAPI.csproj +++ 
b/src/dotnet/SemanticKernelAPI/SemanticKernelAPI.csproj @@ -32,7 +32,7 @@ - + diff --git a/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs b/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs index d81dc09343..a51aac61dc 100644 --- a/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs +++ b/src/dotnet/Vectorization/Handlers/EmbeddingHandler.cs @@ -1,8 +1,13 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; +using System.Text.Json; namespace FoundationaLLM.Vectorization.Handlers { @@ -35,6 +40,42 @@ protected override async Task ProcessRequest( VectorizationRequest request, VectorizationState state, IConfigurationSection? stepConfiguration, - CancellationToken cancellationToken) => await Task.Delay(TimeSpan.FromSeconds(10)); + CancellationToken cancellationToken) + { + await _stateService.LoadArtifacts(state, VectorizationArtifactType.TextPartition); + + var textPartitioningArtifacts = state.Artifacts.Where(a => a.Type == VectorizationArtifactType.TextPartition).ToList(); + + if (textPartitioningArtifacts == null + || textPartitioningArtifacts.Count == 0) + { + state.Log(this, request.Id, _messageId, "The text partition artifacts were not found."); + return; + } + + var serviceFactory = _serviceProvider.GetService>() + ?? 
throw new VectorizationException($"Could not retrieve the text embedding service factory instance."); + var textEmbedding = serviceFactory.GetService(_parameters["text_embedding_profile_name"]); + + var embeddingResult = await textEmbedding.GetEmbeddingsAsync( + textPartitioningArtifacts.Select(tpa => tpa.Content!).ToList()); + + var position = 0; + var serializerOptions = new JsonSerializerOptions + { + Converters = + { + new Embedding.JsonConverter() + } + }; + + foreach (var embedding in embeddingResult.Embeddings) + state.AddOrReplaceArtifact(new VectorizationArtifact + { + Type = VectorizationArtifactType.TextEmbeddingVector, + Position = ++position, + Content = JsonSerializer.Serialize(embedding, serializerOptions) + }); + } } } diff --git a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs index c87d197873..c3ffe5c605 100644 --- a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs +++ b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs @@ -41,9 +41,9 @@ protected override async Task ProcessRequest( IConfigurationSection? stepConfiguration, CancellationToken cancellationToken) { - var serviceFactory = _serviceProvider.GetService>() + var serviceFactory = _serviceProvider.GetService>() ?? 
throw new VectorizationException($"Could not retrieve the content source service factory instance."); - var contentSource = serviceFactory.CreateService(_parameters["content_source_name"]); + var contentSource = serviceFactory.GetService(_parameters["content_source_profile_name"]); var textContent = await contentSource.ExtractTextFromFileAsync(request.ContentIdentifier.MultipartId, cancellationToken); @@ -53,6 +53,7 @@ protected override async Task ProcessRequest( Position = 1, Content = textContent }); + state.ContentSourceProfileName = _parameters["content_source_profile_name"]; } } } diff --git a/src/dotnet/Vectorization/Handlers/IndexingHandler.cs b/src/dotnet/Vectorization/Handlers/IndexingHandler.cs index 096f13d7dc..c63eb6c94b 100644 --- a/src/dotnet/Vectorization/Handlers/IndexingHandler.cs +++ b/src/dotnet/Vectorization/Handlers/IndexingHandler.cs @@ -1,8 +1,13 @@ using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; +using System.Text.Json; namespace FoundationaLLM.Vectorization.Handlers { @@ -35,6 +40,55 @@ protected override async Task ProcessRequest( VectorizationRequest request, VectorizationState state, IConfigurationSection? 
stepConfiguration, - CancellationToken cancellationToken) => await Task.Delay(TimeSpan.FromSeconds(10)); + CancellationToken cancellationToken) + { + await _stateService.LoadArtifacts(state, VectorizationArtifactType.TextEmbeddingVector); + + var textEmbeddingArtifacts = state.Artifacts.Where(a => a.Type == VectorizationArtifactType.TextEmbeddingVector).ToList(); + + if (textEmbeddingArtifacts == null + || textEmbeddingArtifacts.Count == 0) + { + state.Log(this, request.Id, _messageId, "The text partition artifacts were not found."); + return; + } + + var textPartitioningArtifacts = state.Artifacts.Where(a => a.Type == VectorizationArtifactType.TextPartition).ToList(); + + if (textPartitioningArtifacts == null + || textPartitioningArtifacts.Count == 0) + { + state.Log(this, request.Id, _messageId, "The text partition artifacts were not found."); + return; + } + + var serializerOptions = new JsonSerializerOptions + { + Converters = + { + new Embedding.JsonConverter() + } + }; + + var embeddedContent = new EmbeddedContent + { + ContentId = request.ContentIdentifier, + ContentSourceProfileName = state.ContentSourceProfileName!, + ContentParts = Enumerable.Range(0, textEmbeddingArtifacts.Count) + .Select(i => new EmbeddedContentPart + { + Content = textPartitioningArtifacts[i].Content!, + Embedding = JsonSerializer.Deserialize(textEmbeddingArtifacts[i].Content!, serializerOptions) + }).ToList() + }; + + var serviceFactory = _serviceProvider.GetService>() + ?? 
throw new VectorizationException($"Could not retrieve the indexing service factory instance."); + var indexing = serviceFactory.GetServiceWithProfile(_parameters["indexing_profile_name"]); + + await indexing.Service.IndexEmbeddingsAsync( + embeddedContent, + indexing.VectorizationProfile.Settings!["IndexName"]); + } } } diff --git a/src/dotnet/Vectorization/Handlers/PartitionHandler.cs b/src/dotnet/Vectorization/Handlers/PartitionHandler.cs index ab7300bded..59c9563688 100644 --- a/src/dotnet/Vectorization/Handlers/PartitionHandler.cs +++ b/src/dotnet/Vectorization/Handlers/PartitionHandler.cs @@ -53,9 +53,9 @@ protected override async Task ProcessRequest( return; } - var serviceFactory = _serviceProvider.GetService>() + var serviceFactory = _serviceProvider.GetService>() ?? throw new VectorizationException($"Could not retrieve the text splitter service factory instance."); - var textSplitter = serviceFactory.CreateService(_parameters["text_partition_profile_name"]); + var textSplitter = serviceFactory.GetService(_parameters["text_partition_profile_name"]); var splitResult = textSplitter.SplitPlainText(extractedTextArtifact.Content!); diff --git a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs index 8e46e6a21b..4031cac567 100644 --- a/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs +++ b/src/dotnet/Vectorization/Handlers/VectorizationStepHandlerBase.cs @@ -59,8 +59,10 @@ public class VectorizationStepHandlerBase( public string StepId => _stepId; /// - public async Task Invoke(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) + public async Task Invoke(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken) { + var success = true; + try { state.LogHandlerStart(this, request.Id, _messageId); @@ -93,9 +95,12 @@ public async Task Invoke(VectorizationRequest request, VectorizationState 
state, } catch (Exception ex) { + success = false; state.LogHandlerError(this, request.Id, _messageId, ex); _logger.LogError(ex, "Error in executing [{HandlerId}] step handler for request {VectorizationRequestId} (message id {MessageId}).", _stepId, request.Id, _messageId); } + + return success; } private void ValidateRequest(VectorizationRequest request) diff --git a/src/dotnet/Vectorization/Interfaces/IVectorizationStepHandler.cs b/src/dotnet/Vectorization/Interfaces/IVectorizationStepHandler.cs index d8f6f39682..b80a1f4f7e 100644 --- a/src/dotnet/Vectorization/Interfaces/IVectorizationStepHandler.cs +++ b/src/dotnet/Vectorization/Interfaces/IVectorizationStepHandler.cs @@ -19,7 +19,7 @@ public interface IVectorizationStepHandler /// The for which the step should be handled. /// The holding the state associated with the vectorization request. /// The to monitor for cancellation requests. - /// - Task Invoke(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken); + /// True if the vectorization step request was handled successfully. + Task Invoke(VectorizationRequest request, VectorizationState state, CancellationToken cancellationToken); } } diff --git a/src/dotnet/Vectorization/Models/Resources/ContentSource.cs b/src/dotnet/Vectorization/Models/Resources/ContentSourceProfile.cs similarity index 63% rename from src/dotnet/Vectorization/Models/Resources/ContentSource.cs rename to src/dotnet/Vectorization/Models/Resources/ContentSourceProfile.cs index cc3813cbd4..149e92e975 100644 --- a/src/dotnet/Vectorization/Models/Resources/ContentSource.cs +++ b/src/dotnet/Vectorization/Models/Resources/ContentSourceProfile.cs @@ -1,17 +1,13 @@ -using System.Text.Json.Serialization; +using FoundationaLLM.Common.Models.Vectorization; +using System.Text.Json.Serialization; namespace FoundationaLLM.Vectorization.Models.Resources { /// /// Provides detials about a content source. 
/// - public class ContentSource + public class ContentSourceProfile : VectorizationProfileBase { - /// - /// The name of the content source. - /// - public required string Name { get; set; } - /// /// The type of the content source. /// diff --git a/src/dotnet/Vectorization/Models/Resources/ContentSourceStore.cs b/src/dotnet/Vectorization/Models/Resources/ContentSourceStore.cs index fab5b72937..1f4d9b9e4a 100644 --- a/src/dotnet/Vectorization/Models/Resources/ContentSourceStore.cs +++ b/src/dotnet/Vectorization/Models/Resources/ContentSourceStore.cs @@ -11,6 +11,6 @@ public class ContentSourceStore /// /// The list of all content sources that are registered for use by the vectorization pipelines. /// - public required List ContentSources { get; set; } + public required List ContentSourceProfiles { get; set; } } } diff --git a/src/dotnet/Vectorization/Models/Resources/IndexerType.cs b/src/dotnet/Vectorization/Models/Resources/IndexerType.cs new file mode 100644 index 0000000000..de45c3624b --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/IndexerType.cs @@ -0,0 +1,13 @@ +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Types of vectori indexes available to store embeddings. + /// + public enum IndexerType + { + /// + /// Indexer using Azure AI Search vector indexes. + /// + AzureAISearchIndexer + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/IndexingProfile.cs b/src/dotnet/Vectorization/Models/Resources/IndexingProfile.cs new file mode 100644 index 0000000000..add495b8a7 --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/IndexingProfile.cs @@ -0,0 +1,17 @@ +using FoundationaLLM.Common.Models.Vectorization; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Provides details about an indexing profile. + /// + public class IndexingProfile : VectorizationProfileBase + { + /// + /// The type of the indexer. 
+ /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required IndexerType Indexer { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/IndexingProfileStore.cs b/src/dotnet/Vectorization/Models/Resources/IndexingProfileStore.cs new file mode 100644 index 0000000000..583fa0ad6b --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/IndexingProfileStore.cs @@ -0,0 +1,13 @@ +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Models the content of the indexing profiles store managed by the FoundationaLLM.Vectorization resource provider. + /// + public class IndexingProfileStore + { + /// + /// The list of all indexing profiles that are registered for use by the vectorization pipelines. + /// + public required List IndexingProfiles { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfile.cs b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfile.cs new file mode 100644 index 0000000000..68b4e1db85 --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfile.cs @@ -0,0 +1,17 @@ +using FoundationaLLM.Common.Models.Vectorization; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Provides details about a text embedding profile. + /// + public class TextEmbeddingProfile : VectorizationProfileBase + { + /// + /// The type of the text splitter. 
+ /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required TextEmbeddingType TextEmbedding { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfileStore.cs b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfileStore.cs new file mode 100644 index 0000000000..3d293cb8cb --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingProfileStore.cs @@ -0,0 +1,13 @@ +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Models the content of the text embedding profiles store managed by the FoundationaLLM.Vectorization resource provider. + /// + public class TextEmbeddingProfileStore + { + /// + /// The list of all embedding profiles that are registered for use by the vectorization pipelines. + /// + public required List TextEmbeddingProfiles { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/TextEmbeddingType.cs b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingType.cs new file mode 100644 index 0000000000..116a1b1ad2 --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/TextEmbeddingType.cs @@ -0,0 +1,19 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Types of text embeddings available for text embedding. + /// + public enum TextEmbeddingType + { + /// + /// Text embedding that uses Semantic Kernel to embed text. 
+ /// + SemanticKernelTextEmbedding + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/TextPartitionProfile.cs b/src/dotnet/Vectorization/Models/Resources/TextPartitionProfile.cs deleted file mode 100644 index 9932654ab2..0000000000 --- a/src/dotnet/Vectorization/Models/Resources/TextPartitionProfile.cs +++ /dev/null @@ -1,26 +0,0 @@ -using System.Text.Json.Serialization; - -namespace FoundationaLLM.Vectorization.Models.Resources -{ - /// - /// Provides details about a text partitioning profile. - /// - public class TextPartitionProfile - { - /// - /// The name of the text partitioning profile. - /// - public required string Name { get; set; } - - /// - /// The type of the text splitter. - /// - [JsonConverter(typeof(JsonStringEnumConverter))] - public required TextSplitterType TextSplitter { get; set; } - - /// - /// The settings used to configure the text splitter. - /// - public Dictionary? TextSplitterSettings { get; set; } - } -} diff --git a/src/dotnet/Vectorization/Models/Resources/TextPartitioningProfile.cs b/src/dotnet/Vectorization/Models/Resources/TextPartitioningProfile.cs new file mode 100644 index 0000000000..18cbcd3914 --- /dev/null +++ b/src/dotnet/Vectorization/Models/Resources/TextPartitioningProfile.cs @@ -0,0 +1,17 @@ +using FoundationaLLM.Common.Models.Vectorization; +using System.Text.Json.Serialization; + +namespace FoundationaLLM.Vectorization.Models.Resources +{ + /// + /// Provides details about a text partitioning profile. + /// + public class TextPartitioningProfile : VectorizationProfileBase + { + /// + /// The type of the text splitter. 
+ /// + [JsonConverter(typeof(JsonStringEnumConverter))] + public required TextSplitterType TextSplitter { get; set; } + } +} diff --git a/src/dotnet/Vectorization/Models/Resources/TextPartitionProfileStore.cs b/src/dotnet/Vectorization/Models/Resources/TextPartitioningProfileStore.cs similarity index 73% rename from src/dotnet/Vectorization/Models/Resources/TextPartitionProfileStore.cs rename to src/dotnet/Vectorization/Models/Resources/TextPartitioningProfileStore.cs index e91e28d46a..1997d8d412 100644 --- a/src/dotnet/Vectorization/Models/Resources/TextPartitionProfileStore.cs +++ b/src/dotnet/Vectorization/Models/Resources/TextPartitioningProfileStore.cs @@ -3,11 +3,11 @@ /// /// Models the content of the text partition profiles store managed by the FoundationaLLM.Vectorization resource provider. /// - public class TextPartitionProfileStore + public class TextPartitioningProfileStore { /// /// The list of all partition profiles that are registered for use by the vectorization pipelines. /// - public required List TextPartitioningProfiles { get; set; } + public required List TextPartitioningProfiles { get; set; } } } diff --git a/src/dotnet/Vectorization/Models/Resources/TextSplitterType.cs b/src/dotnet/Vectorization/Models/Resources/TextSplitterType.cs index c4d6d57a8b..23f2d9eb21 100644 --- a/src/dotnet/Vectorization/Models/Resources/TextSplitterType.cs +++ b/src/dotnet/Vectorization/Models/Resources/TextSplitterType.cs @@ -1,10 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace FoundationaLLM.Vectorization.Models.Resources +namespace FoundationaLLM.Vectorization.Models.Resources { /// /// Types of text splitters available for text partitioning. 
diff --git a/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs b/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs deleted file mode 100644 index f5b54bdbc2..0000000000 --- a/src/dotnet/Vectorization/Models/VectorizationContentIdentifier.cs +++ /dev/null @@ -1,37 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.Json.Serialization; -using System.Threading.Tasks; - -namespace FoundationaLLM.Vectorization.Models -{ - /// - /// Represents the content associated with a vectorization request. - /// - public class VectorizationContentIdentifier - { - /// - /// The multipart unique identifier of the the content (i.e. document) being vectorized. - /// - [JsonPropertyOrder(1)] - [JsonPropertyName("multipart_id")] - public required List MultipartId { get; set; } - - /// - /// The unique identifier of the content (i.e., document) being vectorized. - /// The identifier is determined by concatenating the parts from . - /// - [JsonIgnore] - public string UniqueId => string.Join("/", MultipartId); - - /// - /// The canonical identifier of the content being vectorized. - /// Vectorization state services use it to derive the location of the state in the underlying storage. 
- /// - [JsonPropertyOrder(2)] - [JsonPropertyName("canonical_id")] - public required string CanonicalId { get; set; } - } -} diff --git a/src/dotnet/Vectorization/Models/VectorizationRequest.cs b/src/dotnet/Vectorization/Models/VectorizationRequest.cs index 8493999edf..a7fa477784 100644 --- a/src/dotnet/Vectorization/Models/VectorizationRequest.cs +++ b/src/dotnet/Vectorization/Models/VectorizationRequest.cs @@ -1,4 +1,5 @@ -using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Vectorization.Exceptions; using System.Text.Json.Serialization; namespace FoundationaLLM.Vectorization.Models @@ -17,11 +18,11 @@ public class VectorizationRequest public required string Id { get; set; } /// - /// The object identifying the content being vectorized. + /// The object identifying the content being vectorized. /// [JsonPropertyOrder(1)] [JsonPropertyName("content_identifier")] - public required VectorizationContentIdentifier ContentIdentifier { get; set; } + public required ContentIdentifier ContentIdentifier { get; set; } /// /// The list of vectorization steps requested by the vectorization request. 
diff --git a/src/dotnet/Vectorization/Models/VectorizationState.cs b/src/dotnet/Vectorization/Models/VectorizationState.cs index fabc2da0ae..172c14b9f3 100644 --- a/src/dotnet/Vectorization/Models/VectorizationState.cs +++ b/src/dotnet/Vectorization/Models/VectorizationState.cs @@ -1,11 +1,6 @@ -using FoundationaLLM.Vectorization.Interfaces; -using Microsoft.Extensions.Hosting; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using FoundationaLLM.Common.Models.TextEmbedding; +using FoundationaLLM.Vectorization.Interfaces; using System.Text.Json.Serialization; -using System.Threading.Tasks; namespace FoundationaLLM.Vectorization.Models { @@ -24,11 +19,11 @@ public class VectorizationState public required string CurrentRequestId { get; set; } /// - /// The object identifying the content being vectorized. + /// The object identifying the content being vectorized. /// [JsonPropertyOrder(1)] [JsonPropertyName("content_identifier")] - public required VectorizationContentIdentifier ContentIdentifier { get; set; } + public required ContentIdentifier ContentIdentifier { get; set; } /// /// The vectorization artifacts associated with the vectorization state. @@ -37,6 +32,13 @@ public class VectorizationState [JsonPropertyName("artifacts")] public List Artifacts { get; set; } = []; + /// + /// The name of the content source profile. + /// + [JsonPropertyOrder(3)] + [JsonPropertyName("content_source_profile_name")] + public string? ContentSourceProfileName { get; set; } + /// /// The list of log entries associated with actions executed by the vectorization pipeline. 
/// diff --git a/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceProviderService.cs b/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceProviderService.cs index c3de1bf364..a9c47bbb0f 100644 --- a/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceProviderService.cs +++ b/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceProviderService.cs @@ -21,11 +21,15 @@ public class VectorizationResourceProviderService( storageService, logger) { - private Dictionary _contentSources = []; - private Dictionary _textPartitionProfiles = []; + private Dictionary _contentSourceProfiles = []; + private Dictionary _textPartitioningProfiles = []; + private Dictionary _textEmbeddingProfiles = []; + private Dictionary _indexingProfiles = []; - private const string CONTENT_SOURCES_FILE_NAME = "vectorization-content-sources.json"; - private const string TEXT_PARTITION_PROFILES_FILE_NAME = "vectorization-text-partition-profiles.json"; + private const string CONTENT_SOURCE_PROFILES_FILE_NAME = "vectorization-content-source-profiles.json"; + private const string TEXT_PARTITION_PROFILES_FILE_NAME = "vectorization-text-partitioning-profiles.json"; + private const string TEXT_EMBEDDING_PROFILES_FILE_NAME = "vectorization-text-embedding-profiles.json"; + private const string INDEXING_PROFILES_FILE_NAME = "vectorization-indexing-profiles.json"; /// protected override string _name => ResourceProviderNames.FoundationaLLM_Vectorization; @@ -34,12 +38,20 @@ public class VectorizationResourceProviderService( protected override Dictionary _resourceTypes => new Dictionary { { - VectorizationResourceTypeNames.ContentSources, - new ResourceTypeDescriptor(VectorizationResourceTypeNames.ContentSources) + VectorizationResourceTypeNames.ContentSourceProfiles, + new ResourceTypeDescriptor(VectorizationResourceTypeNames.ContentSourceProfiles) }, { - VectorizationResourceTypeNames.TextPartitionProfiles, - new 
ResourceTypeDescriptor(VectorizationResourceTypeNames.TextPartitionProfiles) + VectorizationResourceTypeNames.TextPartitioningProfiles, + new ResourceTypeDescriptor(VectorizationResourceTypeNames.TextPartitioningProfiles) + }, + { + VectorizationResourceTypeNames.TextEmbeddingProfiles, + new ResourceTypeDescriptor(VectorizationResourceTypeNames.TextEmbeddingProfiles) + }, + { + VectorizationResourceTypeNames.IndexingProfiles, + new ResourceTypeDescriptor(VectorizationResourceTypeNames.IndexingProfiles) } }; @@ -48,25 +60,45 @@ protected override async Task InitializeInternal() { _logger.LogInformation("Starting to initialize the {ResourceProvider} resource provider...", _name); - var contentSourcesFilePath = $"/{_name}/{CONTENT_SOURCES_FILE_NAME}"; + var contentSourceProfilesFilePath = $"/{_name}/{CONTENT_SOURCE_PROFILES_FILE_NAME}"; var partitionProfilesFilePath = $"/{_name}/{TEXT_PARTITION_PROFILES_FILE_NAME}"; + var embeddingProfilesPath = $"/{_name}/{TEXT_EMBEDDING_PROFILES_FILE_NAME}"; + var indexingProfilesPath = $"/{_name}/{INDEXING_PROFILES_FILE_NAME}"; - if (await _storageService.FileExistsAsync(_storageContainerName, contentSourcesFilePath, default)) + if (await _storageService.FileExistsAsync(_storageContainerName, contentSourceProfilesFilePath, default)) { - var fileContent = await _storageService.ReadFileAsync(_storageContainerName, contentSourcesFilePath, default); - var contentSourcesStore = JsonConvert.DeserializeObject( + var fileContent = await _storageService.ReadFileAsync(_storageContainerName, contentSourceProfilesFilePath, default); + var contentSourceProfilesStore = JsonConvert.DeserializeObject( Encoding.UTF8.GetString(fileContent.ToArray())); - _contentSources = contentSourcesStore!.ContentSources.ToDictionary(cs => cs.Name); + _contentSourceProfiles = contentSourceProfilesStore!.ContentSourceProfiles.ToDictionary(cs => cs.Name); } if (await _storageService.FileExistsAsync(_storageContainerName, partitionProfilesFilePath, default)) { var 
fileContent = await _storageService.ReadFileAsync(_storageContainerName, partitionProfilesFilePath, default); - var textPartitionProfileStore = JsonConvert.DeserializeObject( + var textPartitionProfileStore = JsonConvert.DeserializeObject( + Encoding.UTF8.GetString(fileContent.ToArray())); + + _textPartitioningProfiles = textPartitionProfileStore!.TextPartitioningProfiles.ToDictionary(tpp => tpp.Name); + } + + if (await _storageService.FileExistsAsync(_storageContainerName, embeddingProfilesPath, default)) + { + var fileContent = await _storageService.ReadFileAsync(_storageContainerName, embeddingProfilesPath, default); + var textEmbeddingProfileStore = JsonConvert.DeserializeObject( + Encoding.UTF8.GetString(fileContent.ToArray())); + + _textEmbeddingProfiles = textEmbeddingProfileStore!.TextEmbeddingProfiles.ToDictionary(tep => tep.Name); + } + + if (await _storageService.FileExistsAsync(_storageContainerName, indexingProfilesPath, default)) + { + var fileContent = await _storageService.ReadFileAsync(_storageContainerName, indexingProfilesPath, default); + var indexingProfileStore = JsonConvert.DeserializeObject( Encoding.UTF8.GetString(fileContent.ToArray())); - _textPartitionProfiles = textPartitionProfileStore!.TextPartitioningProfiles.ToDictionary(cs => cs.Name); + _indexingProfiles = indexingProfileStore!.IndexingProfiles.ToDictionary(ip => ip.Name); } _logger.LogInformation("The {ResourceProvider} resource provider was successfully initialized.", _name); @@ -76,34 +108,62 @@ protected override async Task InitializeInternal() protected override T GetResourceInternal(List instances) where T: class => instances[0].ResourceType switch { - VectorizationResourceTypeNames.ContentSources => GetContentSource(instances), - VectorizationResourceTypeNames.TextPartitionProfiles => GetPartitionProfile(instances), + VectorizationResourceTypeNames.ContentSourceProfiles => GetContentSourceProfiles(instances), + VectorizationResourceTypeNames.TextPartitioningProfiles => 
GetTextPartitioningProfile(instances), + VectorizationResourceTypeNames.TextEmbeddingProfiles => GetTextEmbeddingProfile(instances), + VectorizationResourceTypeNames.IndexingProfiles => GetIndexingProfile(instances), _ => throw new ResourceProviderException($"The resource type {instances[0].ResourceType} is not supported by the {_name} resource manager.") }; - private T GetContentSource(List instances) where T: class + private T GetContentSourceProfiles(List instances) where T: class { if (instances.Count != 1) throw new ResourceProviderException($"Invalid resource path"); - if (typeof(T) != typeof(ContentSource)) + if (typeof(T) != typeof(ContentSourceProfile)) throw new ResourceProviderException($"The type of requested resource ({typeof(T)}) does not match the resource type specified in the path ({instances[0].ResourceType})."); - _contentSources.TryGetValue(instances[0].ResourceId!, out var contentSource); + _contentSourceProfiles.TryGetValue(instances[0].ResourceId!, out var contentSource); return contentSource as T ?? throw new ResourceProviderException($"The resource {instances[0].ResourceId!} of type {instances[0].ResourceType} was not found."); } - private T GetPartitionProfile(List instances) where T: class + private T GetTextPartitioningProfile(List instances) where T: class + { + if (instances.Count != 1) + throw new ResourceProviderException($"Invalid resource path"); + + if (typeof(T) != typeof(TextPartitioningProfile)) + throw new ResourceProviderException($"The type of requested resource ({typeof(T)}) does not match the resource type specified in the path ({instances[0].ResourceType})."); + + _textPartitioningProfiles.TryGetValue(instances[0].ResourceId!, out var textPartitioningProfile); + return textPartitioningProfile as T + ?? 
throw new ResourceProviderException($"The resource {instances[0].ResourceId!} of type {instances[0].ResourceType} was not found."); + } + + private T GetTextEmbeddingProfile(List instances) where T : class + { + if (instances.Count != 1) + throw new ResourceProviderException($"Invalid resource path"); + + if (typeof(T) != typeof(TextEmbeddingProfile)) + throw new ResourceProviderException($"The type of requested resource ({typeof(T)}) does not match the resource type specified in the path ({instances[0].ResourceType})."); + + _textEmbeddingProfiles.TryGetValue(instances[0].ResourceId!, out var textEmbeddingProfile); + return textEmbeddingProfile as T + ?? throw new ResourceProviderException($"The resource {instances[0].ResourceId!} of type {instances[0].ResourceType} was not found."); + } + + private T GetIndexingProfile(List instances) where T : class { if (instances.Count != 1) throw new ResourceProviderException($"Invalid resource path"); - if (typeof(T) != typeof(TextPartitionProfile)) + if (typeof(T) != typeof(IndexingProfile)) throw new ResourceProviderException($"The type of requested resource ({typeof(T)}) does not match the resource type specified in the path ({instances[0].ResourceType})."); - _textPartitionProfiles.TryGetValue(instances[0].ResourceId!, out var partitionProfile); - return partitionProfile as T + _indexingProfiles.TryGetValue(instances[0].ResourceId!, out var indexingProfile); + return indexingProfile as T ?? 
throw new ResourceProviderException($"The resource {instances[0].ResourceId!} of type {instances[0].ResourceType} was not found."); } } diff --git a/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceTypeNames.cs b/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceTypeNames.cs index 6ea6fc1fd4..68293111ea 100644 --- a/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceTypeNames.cs +++ b/src/dotnet/Vectorization/ResourceProviders/VectorizationResourceTypeNames.cs @@ -14,11 +14,21 @@ public static class VectorizationResourceTypeNames /// /// Vectorization content sources. /// - public const string ContentSources = "contentSources"; + public const string ContentSourceProfiles = "contentsourceprofiles"; /// /// Text partitioning profiles. /// - public const string TextPartitionProfiles = "textPartitionProfiles"; + public const string TextPartitioningProfiles = "textpartitionprofiles"; + + /// + /// Text embedding profiles. + /// + public const string TextEmbeddingProfiles = "textembeddingprofiles"; + + /// + /// Indexing profiles. 
+ /// + public const string IndexingProfiles = "indexingprofiles"; } } diff --git a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs index ddcdf8a39a..bc1781f1dc 100644 --- a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs +++ b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs @@ -1,5 +1,6 @@ using FoundationaLLM.Common.Constants; using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.Vectorization; using FoundationaLLM.Common.Settings; using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; @@ -27,25 +28,39 @@ namespace FoundationaLLM.Vectorization.Services.ContentSources public class ContentSourceServiceFactory( [FromKeyedServices(DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService)] IResourceProviderService vectorizationResourceProviderService, IConfiguration configuration, - ILoggerFactory loggerFactory) : IServiceFactory + ILoggerFactory loggerFactory) : IVectorizationServiceFactory { private readonly IResourceProviderService _vectorizationResourceProviderService = vectorizationResourceProviderService; private readonly IConfiguration _configuration = configuration; private readonly ILoggerFactory _loggerFactory = loggerFactory; /// - public IContentSourceService CreateService(string serviceName) + public IContentSourceService GetService(string serviceName) { - var contentSource = _vectorizationResourceProviderService.GetResource( - $"/{VectorizationResourceTypeNames.ContentSources}/{serviceName}"); + var contentSourceProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.ContentSourceProfiles}/{serviceName}"); - return contentSource.Type switch + return contentSourceProfile.Type switch { ContentSourceType.AzureDataLake => 
CreateAzureDataLakeContentSourceService(serviceName), - _ => throw new VectorizationException($"The content source type {contentSource.Type} is not supported."), + _ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."), }; } + /// + public (IContentSourceService Service, VectorizationProfileBase VectorizationProfile) GetServiceWithProfile(string serviceName) + { + var contentSourceProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.ContentSourceProfiles}/{serviceName}"); + + return contentSourceProfile.Type switch + { + ContentSourceType.AzureDataLake => (CreateAzureDataLakeContentSourceService(serviceName), contentSourceProfile), + _ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."), + }; + } + + private DataLakeContentSourceService CreateAzureDataLakeContentSourceService(string serviceName) { var blobStorageServiceSettings = new BlobStorageServiceSettings { AuthenticationType = BlobStorageAuthenticationTypes.Unknown }; diff --git a/src/dotnet/Vectorization/Services/RequestManagerService.cs b/src/dotnet/Vectorization/Services/RequestManagerService.cs index 55e92f1f4f..52f3a181d3 100644 --- a/src/dotnet/Vectorization/Services/RequestManagerService.cs +++ b/src/dotnet/Vectorization/Services/RequestManagerService.cs @@ -108,10 +108,12 @@ private async Task ProcessRequest(VectorizationRequest request, string messageId { try { - await HandleRequest(request, messageId).ConfigureAwait(false); - - await _incomingRequestSourceService.DeleteRequest(messageId, popReceipt).ConfigureAwait(false); - await AdvanceRequest(request).ConfigureAwait(false); + if (await HandleRequest(request, messageId).ConfigureAwait(false)) + { + // If the request was handled successfully, remove it from the current source and advance it to the next step. 
+ await _incomingRequestSourceService.DeleteRequest(messageId, popReceipt).ConfigureAwait(false); + await AdvanceRequest(request).ConfigureAwait(false); + } } catch (Exception ex) { @@ -119,7 +121,7 @@ private async Task ProcessRequest(VectorizationRequest request, string messageId } } - private async Task HandleRequest(VectorizationRequest request, string messageId) + private async Task HandleRequest(VectorizationRequest request, string messageId) { var state = await _vectorizationStateService.HasState(request).ConfigureAwait(false) ? await _vectorizationStateService.ReadState(request).ConfigureAwait(false) @@ -133,9 +135,11 @@ private async Task HandleRequest(VectorizationRequest request, string messageId) _vectorizationStateService, _serviceProvider, _loggerFactory); - await stepHandler.Invoke(request, state, _cancellationToken).ConfigureAwait(false); + var handlerSuccess = await stepHandler.Invoke(request, state, _cancellationToken).ConfigureAwait(false); await _vectorizationStateService.SaveState(state).ConfigureAwait(false); + + return handlerSuccess; } private async Task AdvanceRequest(VectorizationRequest request) diff --git a/src/dotnet/Vectorization/Services/Text/IndexingServiceFactory.cs b/src/dotnet/Vectorization/Services/Text/IndexingServiceFactory.cs new file mode 100644 index 0000000000..36f27c712c --- /dev/null +++ b/src/dotnet/Vectorization/Services/Text/IndexingServiceFactory.cs @@ -0,0 +1,68 @@ +using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.Vectorization; +using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Vectorization.Models.Resources; +using FoundationaLLM.Vectorization.ResourceProviders; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; + +namespace FoundationaLLM.Vectorization.Services.Text +{ + /// + /// Creates text splitter service instances. 
+ /// + /// The vectorization resource provider service. + /// The global configuration provider. + /// The providing dependency injection services. + /// The logger factory used to create loggers. + public class IndexingServiceFactory( + [FromKeyedServices(DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService)] IResourceProviderService vectorizationResourceProviderService, + IConfiguration configuration, + IServiceProvider serviceProvider, + ILoggerFactory loggerFactory) : IVectorizationServiceFactory + { + private readonly IResourceProviderService _vectorizationResourceProviderService = vectorizationResourceProviderService; + private readonly IConfiguration _configuration = configuration; + private readonly IServiceProvider _serviceProvider = serviceProvider; + private readonly ILoggerFactory _loggerFactory = loggerFactory; + + /// + public IIndexingService GetService(string serviceName) + { + var indexingProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.IndexingProfiles}/{serviceName}"); + + return indexingProfile.Indexer switch + { + IndexerType.AzureAISearchIndexer => CreateAzureAISearchIndexingService( + indexingProfile.Settings!["IndexName"]), + _ => throw new VectorizationException($"The text embedding type {indexingProfile.Indexer} is not supported."), + }; + } + + /// + public (IIndexingService Service, VectorizationProfileBase VectorizationProfile) GetServiceWithProfile(string serviceName) + { + var indexingProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.IndexingProfiles}/{serviceName}"); + + return indexingProfile.Indexer switch + { + IndexerType.AzureAISearchIndexer => (CreateAzureAISearchIndexingService( + indexingProfile.Settings!["IndexName"]), indexingProfile), + _ => throw new VectorizationException($"The text embedding type {indexingProfile.Indexer} is not supported."), + }; + } + + private IIndexingService 
CreateAzureAISearchIndexingService(string indexName) + { + var indexingService = _serviceProvider.GetKeyedService( + DependencyInjectionKeys.FoundationaLLM_Vectorization_AzureAISearchIndexingService) + ?? throw new VectorizationException($"Could not retrieve the Azure AI Search indexing service instance."); + + return indexingService!; + } + } +} diff --git a/src/dotnet/Vectorization/Services/Text/TextEmbeddingServiceFactory.cs b/src/dotnet/Vectorization/Services/Text/TextEmbeddingServiceFactory.cs new file mode 100644 index 0000000000..8550a5d2d8 --- /dev/null +++ b/src/dotnet/Vectorization/Services/Text/TextEmbeddingServiceFactory.cs @@ -0,0 +1,70 @@ +using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Interfaces; +using FoundationaLLM.Common.Models.Configuration.Text; +using FoundationaLLM.Common.Models.Vectorization; +using FoundationaLLM.Common.Services.TextSplitters; +using FoundationaLLM.SemanticKernel.Core.Services; +using FoundationaLLM.Vectorization.Exceptions; +using FoundationaLLM.Vectorization.Models.Resources; +using FoundationaLLM.Vectorization.ResourceProviders; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace FoundationaLLM.Vectorization.Services.Text +{ + /// + /// Creates text splitter service instances. + /// + /// The vectorization resource provider service. + /// The global configuration provider. + /// The providing dependency injection services. + /// The logger factory used to create loggers. 
+ public class TextEmbeddingServiceFactory( + [FromKeyedServices(DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService)] IResourceProviderService vectorizationResourceProviderService, + IConfiguration configuration, + IServiceProvider serviceProvider, + ILoggerFactory loggerFactory) : IVectorizationServiceFactory + { + private readonly IResourceProviderService _vectorizationResourceProviderService = vectorizationResourceProviderService; + private readonly IConfiguration _configuration = configuration; + private readonly IServiceProvider _serviceProvider = serviceProvider; + private readonly ILoggerFactory _loggerFactory = loggerFactory; + + /// + public ITextEmbeddingService GetService(string serviceName) + { + var textEmbeddingProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.TextEmbeddingProfiles}/{serviceName}"); + + return textEmbeddingProfile.TextEmbedding switch + { + TextEmbeddingType.SemanticKernelTextEmbedding => CreateSemanticKernelTextEmbeddingService(), + _ => throw new VectorizationException($"The text embedding type {textEmbeddingProfile.TextEmbedding} is not supported."), + }; + } + + /// + public (ITextEmbeddingService Service, VectorizationProfileBase VectorizationProfile) GetServiceWithProfile(string serviceName) + { + var textEmbeddingProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.TextEmbeddingProfiles}/{serviceName}"); + + return textEmbeddingProfile.TextEmbedding switch + { + TextEmbeddingType.SemanticKernelTextEmbedding => (CreateSemanticKernelTextEmbeddingService(), textEmbeddingProfile), + _ => throw new VectorizationException($"The text embedding type {textEmbeddingProfile.TextEmbedding} is not supported."), + }; + } + + private ITextEmbeddingService CreateSemanticKernelTextEmbeddingService() + { + var textEmbeddingService = _serviceProvider.GetKeyedService( + 
DependencyInjectionKeys.FoundationaLLM_Vectorization_SemanticKernelTextEmbeddingService) + ?? throw new VectorizationException($"Could not retrieve the Semantic Kernel text embedding service instance."); + + return textEmbeddingService!; + } + } +} diff --git a/src/dotnet/Vectorization/Services/Text/TextSplitterServiceFactory.cs b/src/dotnet/Vectorization/Services/Text/TextSplitterServiceFactory.cs index 979bc558dd..c0ce2c6983 100644 --- a/src/dotnet/Vectorization/Services/Text/TextSplitterServiceFactory.cs +++ b/src/dotnet/Vectorization/Services/Text/TextSplitterServiceFactory.cs @@ -1,7 +1,8 @@ -using Azure.Core; -using FoundationaLLM.Common.Constants; +using FoundationaLLM.Common.Constants; using FoundationaLLM.Common.Interfaces; using FoundationaLLM.Common.Models.Configuration.Text; +using FoundationaLLM.Common.Models.Vectorization; +using FoundationaLLM.Common.Services.TextSplitters; using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Models.Resources; using FoundationaLLM.Vectorization.ResourceProviders; @@ -9,11 +10,6 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace FoundationaLLM.Vectorization.Services.Text { @@ -28,7 +24,7 @@ public class TextSplitterServiceFactory( [FromKeyedServices(DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService)] IResourceProviderService vectorizationResourceProviderService, IConfiguration configuration, IServiceProvider serviceProvider, - ILoggerFactory loggerFactory) : IServiceFactory + ILoggerFactory loggerFactory) : IVectorizationServiceFactory { private readonly IResourceProviderService _vectorizationResourceProviderService = vectorizationResourceProviderService; private readonly IConfiguration _configuration = configuration; @@ -36,15 +32,29 @@ public class 
TextSplitterServiceFactory( private readonly ILoggerFactory _loggerFactory = loggerFactory; /// - public ITextSplitterService CreateService(string serviceName) + public ITextSplitterService GetService(string serviceName) { - var textPartitionProfile = _vectorizationResourceProviderService.GetResource( - $"/{VectorizationResourceTypeNames.TextPartitionProfiles}/{serviceName}"); + var textPartitionProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.TextPartitioningProfiles}/{serviceName}"); return textPartitionProfile.TextSplitter switch { TextSplitterType.TokenTextSplitter => CreateTokenTextSplitterService( - TokenTextSplitterServiceSettings.FromDictionary(textPartitionProfile.TextSplitterSettings!)), + TokenTextSplitterServiceSettings.FromDictionary(textPartitionProfile.Settings!)), + _ => throw new VectorizationException($"The text splitter type {textPartitionProfile.TextSplitter} is not supported."), + }; + } + + /// + public (ITextSplitterService Service, VectorizationProfileBase VectorizationProfile) GetServiceWithProfile(string serviceName) + { + var textPartitionProfile = _vectorizationResourceProviderService.GetResource( + $"/{VectorizationResourceTypeNames.TextPartitioningProfiles}/{serviceName}"); + + return textPartitionProfile.TextSplitter switch + { + TextSplitterType.TokenTextSplitter => (CreateTokenTextSplitterService( + TokenTextSplitterServiceSettings.FromDictionary(textPartitionProfile.Settings!)), textPartitionProfile), _ => throw new VectorizationException($"The text splitter type {textPartitionProfile.TextSplitter} is not supported."), }; } diff --git a/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs b/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs index b10df6398e..9da26d43ed 100644 --- a/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs +++ 
b/src/dotnet/Vectorization/Services/VectorizationStates/VectorizationStateServiceBase.cs @@ -1,10 +1,6 @@ -using FoundationaLLM.Vectorization.Models; -using System; -using System.Collections.Generic; -using System.Linq; +using FoundationaLLM.Common.Models.TextEmbedding; using System.Security.Cryptography; using System.Text; -using System.Threading.Tasks; namespace FoundationaLLM.Vectorization.Services.VectorizationStates { @@ -16,17 +12,17 @@ public abstract class VectorizationStateServiceBase /// /// Gets the location of the vectorization state based on the content identifier. /// - /// The holding the content identification information. + /// The holding the content identification information. /// - protected string GetPersistenceIdentifier(VectorizationContentIdentifier contentIdentifier) => + protected string GetPersistenceIdentifier(ContentIdentifier contentIdentifier) => $"{contentIdentifier.CanonicalId}_state_{HashContentIdentifier(contentIdentifier)}"; /// /// Computes the MD5 hash of the content identifier. /// - /// The holding the content identification information. + /// The holding the content identification information. 
/// - protected static string HashContentIdentifier(VectorizationContentIdentifier contentIdentifier) + protected static string HashContentIdentifier(ContentIdentifier contentIdentifier) { var byteHash = MD5.HashData( Encoding.UTF8.GetBytes( diff --git a/src/dotnet/Vectorization/Vectorization.csproj b/src/dotnet/Vectorization/Vectorization.csproj index b142c34fa9..dfc7d1ba8c 100644 --- a/src/dotnet/Vectorization/Vectorization.csproj +++ b/src/dotnet/Vectorization/Vectorization.csproj @@ -20,6 +20,7 @@ + diff --git a/src/dotnet/VectorizationWorker/Program.cs b/src/dotnet/VectorizationWorker/Program.cs index 392f6d4cd0..e7540c0c47 100644 --- a/src/dotnet/VectorizationWorker/Program.cs +++ b/src/dotnet/VectorizationWorker/Program.cs @@ -7,6 +7,8 @@ using FoundationaLLM.Common.Services; using FoundationaLLM.Common.Services.Tokenizers; using FoundationaLLM.Common.Settings; +using FoundationaLLM.SemanticKernel.Core.Models.Configuration; +using FoundationaLLM.SemanticKernel.Core.Services; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models.Configuration; using FoundationaLLM.Vectorization.ResourceProviders; @@ -64,6 +66,12 @@ DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService) .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_ResourceProviderService_Storage)); +builder.Services.AddOptions() + .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_SemanticKernelTextEmbeddingService)); + +builder.Services.AddOptions() + .Bind(builder.Configuration.GetSection(AppConfigurationKeySections.FoundationaLLM_Vectorization_AzureAISearchIndexingService)); + builder.Services.AddSingleton( typeof(IEnumerable), new IConfigurationSection[] { @@ -96,17 +104,33 @@ logger); }); +// Vectorization state builder.Services.AddSingleton(); + +// Vectorization resource provider builder.Services.AddKeyedSingleton( 
DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService); builder.Services.ActivateKeyedSingleton( DependencyInjectionKeys.FoundationaLLM_Vectorization_ResourceProviderService); -builder.Services.AddSingleton, ContentSourceServiceFactory>(); -builder.Services.AddSingleton, TextSplitterServiceFactory>(); +// Service factories +builder.Services.AddSingleton, ContentSourceServiceFactory>(); +builder.Services.AddSingleton, TextSplitterServiceFactory>(); +builder.Services.AddSingleton, TextEmbeddingServiceFactory>(); +builder.Services.AddSingleton, IndexingServiceFactory>(); + +// Tokenizer builder.Services.AddKeyedSingleton(TokenizerServiceNames.MICROSOFT_BPE_TOKENIZER); builder.Services.ActivateKeyedSingleton(TokenizerServiceNames.MICROSOFT_BPE_TOKENIZER); +// Text embedding +builder.Services.AddKeyedSingleton( + DependencyInjectionKeys.FoundationaLLM_Vectorization_SemanticKernelTextEmbeddingService); + +// Indexing +builder.Services.AddKeyedSingleton( + DependencyInjectionKeys.FoundationaLLM_Vectorization_AzureAISearchIndexingService); + builder.Services.AddTransient(); builder.Services.AddHostedService(); diff --git a/src/python/LangChainAPI/LangChainAPI.pyproj b/src/python/LangChainAPI/LangChainAPI.pyproj index 697867af2a..2c67eac325 100644 --- a/src/python/LangChainAPI/LangChainAPI.pyproj +++ b/src/python/LangChainAPI/LangChainAPI.pyproj @@ -24,6 +24,7 @@ . + True true diff --git a/tests/dotnet/SemanticKernel.Tests/SemanticKernel.Tests.csproj b/tests/dotnet/SemanticKernel.Tests/SemanticKernel.Tests.csproj index 9dc5a4ca68..617d193baf 100644 --- a/tests/dotnet/SemanticKernel.Tests/SemanticKernel.Tests.csproj +++ b/tests/dotnet/SemanticKernel.Tests/SemanticKernel.Tests.csproj @@ -26,7 +26,7 @@ - +