diff --git a/src/dotnet/Vectorization/Models/Configuration/AzureSQLDatabaseContentSourceServiceSettings.cs b/src/dotnet/Vectorization/Models/Configuration/AzureSQLDatabaseContentSourceServiceSettings.cs
new file mode 100644
index 0000000000..dfe4447331
--- /dev/null
+++ b/src/dotnet/Vectorization/Models/Configuration/AzureSQLDatabaseContentSourceServiceSettings.cs
@@ -0,0 +1,13 @@
+namespace FoundationaLLM.Vectorization.Models.Configuration
+{
+    /// <summary>
+    /// Provides configuration settings to initialize an Azure SQL Database content source service.
+    /// </summary>
+    public class AzureSQLDatabaseContentSourceServiceSettings
+    {
+        /// <summary>
+        /// The connection string used for authentication.
+        /// </summary>
+        public string? ConnectionString { get; set; }
+    }
+}
diff --git a/src/dotnet/Vectorization/Models/Resources/ContentSourceType.cs b/src/dotnet/Vectorization/Models/Resources/ContentSourceType.cs
index 89266d3eb5..88fb4cc654 100644
--- a/src/dotnet/Vectorization/Models/Resources/ContentSourceType.cs
+++ b/src/dotnet/Vectorization/Models/Resources/ContentSourceType.cs
@@ -19,6 +19,11 @@ public enum ContentSourceType
         /// <summary>
         /// SharePoint Online document library.
         /// </summary>
-        SharePointOnline
+        SharePointOnline,
+
+        /// <summary>
+        /// Azure SQL Database.
+        /// </summary>
+        AzureSQLDatabase
     }
 }
diff --git a/src/dotnet/Vectorization/Services/ContentSources/AzureSQLDatabaseContentSourceService.cs b/src/dotnet/Vectorization/Services/ContentSources/AzureSQLDatabaseContentSourceService.cs
new file mode 100644
index 0000000000..4dcf1e24a0
--- /dev/null
+++ b/src/dotnet/Vectorization/Services/ContentSources/AzureSQLDatabaseContentSourceService.cs
@@ -0,0 +1,114 @@
+using FoundationaLLM.Common.Constants;
+using FoundationaLLM.Vectorization.DataFormats.PDF;
+using FoundationaLLM.Vectorization.Exceptions;
+using FoundationaLLM.Vectorization.Interfaces;
+using FoundationaLLM.Vectorization.Models.Configuration;
+using Microsoft.Data.SqlClient;
+using Microsoft.Extensions.Logging;
+
+namespace FoundationaLLM.Vectorization.Services.ContentSources
+{
+    /// <summary>
+    /// Implements a vectorization content source for content residing in Azure SQL.
+    /// </summary>
+    public class AzureSQLDatabaseContentSourceService : ContentSourceServiceBase, IContentSourceService
+    {
+        private readonly ILogger<AzureSQLDatabaseContentSourceService> _logger;
+        private readonly AzureSQLDatabaseContentSourceServiceSettings _settings;
+
+        /// <summary>
+        /// Creates a new instance of the vectorization content source.
+        /// </summary>
+        /// <param name="settings">The settings holding the connection string of the source database.</param>
+        /// <param name="loggerFactory">The factory used to create the logger for this service.</param>
+        public AzureSQLDatabaseContentSourceService(
+            AzureSQLDatabaseContentSourceServiceSettings settings,
+            ILoggerFactory loggerFactory)
+        {
+            _settings = settings;
+            _logger = loggerFactory.CreateLogger<AzureSQLDatabaseContentSourceService>();
+        }
+
+        /// <inheritdoc/>
+        public async Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
+        {
+            // The five parts are, in order: schema, table, content column, identifier column, identifier value.
+            ValidateMultipartId(multipartId, 5);
+
+            var binaryContent = await GetBinaryContent(
+                multipartId[0],
+                multipartId[1],
+                multipartId[2],
+                multipartId[3],
+                multipartId[4],
+                cancellationToken);
+
+            // The identifier value is also passed on as the file name for text extraction.
+            return await ExtractTextFromFileAsync(multipartId[4], binaryContent);
+        }
+
+        /// <summary>
+        /// Retrieves the content stored in a single column of a single database row.
+        /// </summary>
+        /// <param name="schema">The database schema containing the target table.</param>
+        /// <param name="tableName">The name of the table from which to retrieve binary content.</param>
+        /// <param name="contentColumnName">The name of the column containing binary data.</param>
+        /// <param name="identifierColumnName">The name of the column used for identifying the specific record.</param>
+        /// <param name="identifierValue">The value identifying the specific record in the identifier column.</param>
+        /// <param name="cancellationToken">The cancellation token that signals that operations should be cancelled.</param>
+        /// <returns>An object representing the binary contents.</returns>
+        /// <exception cref="VectorizationException">The record is missing, the column holds no usable content, or the database call failed.</exception>
+        private async Task<BinaryData> GetBinaryContent(string schema, string tableName, string contentColumnName, string identifierColumnName, string identifierValue, CancellationToken cancellationToken)
+        {
+            try
+            {
+                // Identifiers cannot be bound as SQL parameters, so each one is bracket-quoted instead.
+                var query =
+                    $"SELECT TOP 1 {QuoteIdentifier(contentColumnName)} " +
+                    $"FROM {QuoteIdentifier(schema)}.{QuoteIdentifier(tableName)} " +
+                    $"WHERE {QuoteIdentifier(identifierColumnName)} = @identifierValue";
+
+                await using var connection = new SqlConnection(_settings.ConnectionString);
+                await connection.OpenAsync(cancellationToken);
+
+                await using var command = new SqlCommand(query, connection);
+                command.Parameters.Add(new SqlParameter("@identifierValue", identifierValue));
+
+                await using var reader = await command.ExecuteReaderAsync(cancellationToken);
+                if (!await reader.ReadAsync(cancellationToken))
+                {
+                    throw new VectorizationException($"The file {identifierValue} was not found in the database.");
+                }
+
+                // Only one column is selected, so it is read by ordinal. The value must be typed
+                // before it is wrapped: the BinaryData(object) constructor would JSON-serialize it.
+                return reader.GetValue(0) switch
+                {
+                    byte[] bytes => new BinaryData(bytes),
+                    string text => new BinaryData(text),
+                    _ => throw new VectorizationException(
+                        $"The column {contentColumnName} holds no binary or text content for the file {identifierValue}.")
+                };
+            }
+            catch (Exception ex) when (ex is not VectorizationException and not OperationCanceledException)
+            {
+                throw new VectorizationException($"Error when extracting content from file identified by {identifierValue} in Azure SQL Database.", ex);
+            }
+        }
+
+        /// <summary>
+        /// Quotes a SQL Server identifier so that it cannot break out of the query text.
+        /// </summary>
+        /// <param name="identifier">The schema, table or column name to quote.</param>
+        /// <returns>The identifier wrapped in brackets, with closing brackets doubled.</returns>
+        private static string QuoteIdentifier(string identifier)
+        {
+            if (string.IsNullOrWhiteSpace(identifier))
+            {
+                throw new VectorizationException("A database identifier must not be empty.");
+            }
+
+            return "[" + identifier.Replace("]", "]]", StringComparison.Ordinal) + "]";
+        }
+    }
+}
diff --git a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs
index 924f78bc5d..8d39028bee 100644
--- a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs
+++ b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs
@@ -41,6 +41,7 @@ public IContentSourceService GetService(string serviceName)
             {
                 ContentSourceType.AzureDataLake => CreateAzureDataLakeContentSourceService(serviceName),
                 ContentSourceType.SharePointOnline => CreateSharePointOnlineContentSourceService(serviceName),
+                ContentSourceType.AzureSQLDatabase => CreateAzureSQLDatabaseContentSourceService(serviceName),
                 _ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
             };
         }
@@ -55,6 +56,7 @@ public IContentSourceService GetService(string serviceName)
             {
                 ContentSourceType.AzureDataLake => (CreateAzureDataLakeContentSourceService(serviceName), contentSourceProfile),
                 ContentSourceType.SharePointOnline => (CreateSharePointOnlineContentSourceService(serviceName), contentSourceProfile),
+                ContentSourceType.AzureSQLDatabase => (CreateAzureSQLDatabaseContentSourceService(serviceName), contentSourceProfile),
                 _ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
             };
         }
@@ -83,5 +85,17 @@ private SharePointOnlineContentSourceService CreateSharePointOnlineContentSource
                 sharePointOnlineContentSourceServiceSettings,
                 _loggerFactory);
         }
+
+        private AzureSQLDatabaseContentSourceService CreateAzureSQLDatabaseContentSourceService(string serviceName)
+        {
+            var azureSQLDatabaseContentSourceServiceSettings = new AzureSQLDatabaseContentSourceServiceSettings();
+            _configuration.Bind(
+                $"{AppConfigurationKeySections.FoundationaLLM_Vectorization_ContentSources}:{serviceName}",
+                azureSQLDatabaseContentSourceServiceSettings);
+
+            return new AzureSQLDatabaseContentSourceService(
+                azureSQLDatabaseContentSourceServiceSettings,
+                _loggerFactory);
+        }
     }
 }
diff --git a/src/dotnet/Vectorization/Vectorization.csproj b/src/dotnet/Vectorization/Vectorization.csproj
index 5c0decc269..0a663fc22c 100644
--- a/src/dotnet/Vectorization/Vectorization.csproj
+++ b/src/dotnet/Vectorization/Vectorization.csproj
@@ -15,6 +15,7 @@
+
diff --git a/src/dotnet/VectorizationAPI/VectorizationAPI.csproj b/src/dotnet/VectorizationAPI/VectorizationAPI.csproj
index fb544ce400..0d9a773b7c 100644
--- a/src/dotnet/VectorizationAPI/VectorizationAPI.csproj
+++ b/src/dotnet/VectorizationAPI/VectorizationAPI.csproj
@@ -4,7 +4,7 @@
     net8.0
     enable
     enable
-    true
+    false
     35b5c460-a49c-4185-a169-676d90673146
     FoundationaLLM.Vectorization.API
     FoundationaLLM.Vectorization.API