Skip to content

Commit

Permalink
Merge pull request #516 from solliancenet/aa-azure-sql-content-source
Browse files Browse the repository at this point in the history
Azure SQL Database vectorization content source
  • Loading branch information
joelhulen authored Jan 29, 2024
2 parents 65b1501 + 1780ba6 commit 2843550
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace FoundationaLLM.Vectorization.Models.Configuration
{
/// <summary>
/// Provides the configuration settings required to initialize an Azure SQL Database content source service.
/// </summary>
public class AzureSQLDatabaseContentSourceServiceSettings
{
/// <summary>
/// The connection string used to connect and authenticate to the Azure SQL database.
/// Remains <c>null</c> until a value is assigned.
/// </summary>
public string? ConnectionString { get; set; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ public enum ContentSourceType
/// <summary>
/// SharePoint Online document library.
/// </summary>
SharePointOnline
SharePointOnline,

/// <summary>
/// Azure SQL Database.
/// </summary>
AzureSQLDatabase
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Vectorization.DataFormats.PDF;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models.Configuration;
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Logging;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
/// <summary>
/// Implements a vectorization content source for content residing in a column of an Azure SQL Database table.
/// </summary>
/// <remarks>
/// Content is addressed by a five-part identifier, in this order: schema, table name,
/// content column name, identifier column name and identifier value.
/// </remarks>
public class AzureSQLDatabaseContentSourceService : ContentSourceServiceBase, IContentSourceService
{
    private readonly ILogger<AzureSQLDatabaseContentSourceService> _logger;
    private readonly AzureSQLDatabaseContentSourceServiceSettings _settings;

    /// <summary>
    /// Creates a new instance of the vectorization content source.
    /// </summary>
    /// <param name="settings">The settings holding the connection string of the target database.</param>
    /// <param name="loggerFactory">The factory used to create the logger of this service.</param>
    public AzureSQLDatabaseContentSourceService(
        AzureSQLDatabaseContentSourceServiceSettings settings,
        ILoggerFactory loggerFactory)
    {
        _settings = settings;
        _logger = loggerFactory.CreateLogger<AzureSQLDatabaseContentSourceService>();
    }

    /// <inheritdoc/>
    public async Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
    {
        ValidateMultipartId(multipartId, 5);

        var binaryContent = await GetBinaryContent(
            multipartId[0],
            multipartId[1],
            multipartId[2],
            multipartId[3],
            multipartId[4],
            cancellationToken);

        // The identifier value doubles as the file name handed to the text extraction overload.
        return await ExtractTextFromFileAsync(multipartId[4], binaryContent);
    }

    /// <summary>
    /// Retrieves the binary content of a single record.
    /// </summary>
    /// <param name="schema">The database schema containing the target table.</param>
    /// <param name="tableName">The name of the table from which to retrieve binary content.</param>
    /// <param name="contentColumnName">The name of the column containing binary data.</param>
    /// <param name="identifierColumnName">The name of the column used for identifying the specific record.</param>
    /// <param name="identifierValue">The value identifying the specific record in the identifier column.</param>
    /// <param name="cancellationToken">The cancellation token that signals that operations should be cancelled.</param>
    /// <returns>An object representing the binary contents.</returns>
    /// <remarks>
    /// Schema, table and column names must be supplied as raw names (without surrounding brackets);
    /// they are emitted as bracket-delimited identifiers. The identifier value is always sent as a
    /// query parameter.
    /// </remarks>
    /// <exception cref="VectorizationException">
    /// Thrown when an identifier part is empty, the record is not found, the content is NULL,
    /// or the database operation fails.
    /// </exception>
    private async Task<BinaryData> GetBinaryContent(string schema, string tableName, string contentColumnName, string identifierColumnName, string identifierValue, CancellationToken cancellationToken)
    {
        try
        {
            // Object names cannot be passed as SQL parameters, so each one is emitted as a
            // delimited identifier with any closing bracket escaped. This prevents a crafted
            // name from terminating the identifier and injecting additional SQL.
            var commandText =
                "SELECT TOP 1 " + QuoteIdentifier(contentColumnName)
                + " FROM " + QuoteIdentifier(schema) + "." + QuoteIdentifier(tableName)
                + " WHERE " + QuoteIdentifier(identifierColumnName) + " = @identifierValue";

            await using var connection = new SqlConnection(_settings.ConnectionString);
            await connection.OpenAsync(cancellationToken);

            using var command = new SqlCommand(commandText, connection);
            command.Parameters.Add(new SqlParameter("@identifierValue", identifierValue));

            await using var reader = await command.ExecuteReaderAsync(cancellationToken);

            // ReadAsync returns false when the result set is empty, so no separate HasRows check is needed.
            if (!await reader.ReadAsync(cancellationToken))
            {
                throw new VectorizationException($"The file {identifierValue} was not found in the database.");
            }

            // The query selects exactly one column, so the content is always at ordinal 0.
            // The value must be dispatched on its runtime type: passing it to BinaryData typed as
            // object selects the JSON-serializing constructor, which would turn a byte[] into a
            // quoted base64 string instead of the raw file bytes.
            return reader.GetValue(0) switch
            {
                byte[] bytes => new BinaryData(bytes),
                string text => new BinaryData(text),
                DBNull => throw new VectorizationException($"The content of the file {identifierValue} is NULL in the database."),
                var other => new BinaryData(other)
            };
        }
        catch (Exception ex) when (ex is not VectorizationException)
        {
            // Exceptions raised above with a specific message are allowed to surface unchanged;
            // everything else is wrapped so callers keep seeing a VectorizationException.
            throw new VectorizationException($"Error when extracting content from file identified by {identifierValue} in Azure SQL Database.", ex);
        }
    }

    /// <summary>
    /// Wraps a SQL Server object name in brackets, doubling any embedded closing bracket.
    /// </summary>
    /// <param name="identifier">The raw (unquoted) schema, table or column name.</param>
    /// <returns>The bracket-delimited identifier.</returns>
    /// <exception cref="VectorizationException">Thrown when the name is null, empty or whitespace.</exception>
    private static string QuoteIdentifier(string identifier)
    {
        if (string.IsNullOrWhiteSpace(identifier))
        {
            throw new VectorizationException("The schema, table and column names used to locate content in Azure SQL Database must not be empty.");
        }

        return "[" + identifier.Replace("]", "]]") + "]";
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public IContentSourceService GetService(string serviceName)
{
ContentSourceType.AzureDataLake => CreateAzureDataLakeContentSourceService(serviceName),
ContentSourceType.SharePointOnline => CreateSharePointOnlineContentSourceService(serviceName),
ContentSourceType.AzureSQLDatabase => CreateAzureSQLDatabaseContentSourceService(serviceName),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand All @@ -55,6 +56,7 @@ public IContentSourceService GetService(string serviceName)
{
ContentSourceType.AzureDataLake => (CreateAzureDataLakeContentSourceService(serviceName), contentSourceProfile),
ContentSourceType.SharePointOnline => (CreateSharePointOnlineContentSourceService(serviceName), contentSourceProfile),
ContentSourceType.AzureSQLDatabase => (CreateAzureSQLDatabaseContentSourceService(serviceName), contentSourceProfile),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand Down Expand Up @@ -83,5 +85,17 @@ private SharePointOnlineContentSourceService CreateSharePointOnlineContentSource
sharePointOnlineContentSourceServiceSettings,
_loggerFactory);
}

/// <summary>
/// Builds an Azure SQL Database content source service whose settings are bound from the
/// configuration section named after the given service.
/// </summary>
/// <param name="serviceName">The name of the content source, used as the last segment of the configuration key.</param>
/// <returns>The configured Azure SQL Database content source service.</returns>
private AzureSQLDatabaseContentSourceService CreateAzureSQLDatabaseContentSourceService(string serviceName)
{
    // Settings live under "<content sources section>:<service name>".
    var settingsSectionKey =
        $"{AppConfigurationKeySections.FoundationaLLM_Vectorization_ContentSources}:{serviceName}";

    var serviceSettings = new AzureSQLDatabaseContentSourceServiceSettings();
    _configuration.Bind(settingsSectionKey, serviceSettings);

    return new AzureSQLDatabaseContentSourceService(serviceSettings, _loggerFactory);
}
}
}
1 change: 1 addition & 0 deletions src/dotnet/Vectorization/Vectorization.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
<PackageReference Include="Azure.Security.KeyVault.Secrets" Version="4.5.0" />
<PackageReference Include="Azure.Storage.Blobs" Version="12.19.1" />
<PackageReference Include="Azure.Storage.Queues" Version="12.17.1" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.1.4" />
<PackageReference Include="ClosedXML" Version="0.104.0-preview2" />
<PackageReference Include="ClosedXML.Parser" Version="1.2.0" />
<PackageReference Include="DocumentFormat.OpenXml" Version="3.0.1" />
Expand Down
2 changes: 1 addition & 1 deletion src/dotnet/VectorizationAPI/VectorizationAPI.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<InvariantGlobalization>true</InvariantGlobalization>
<InvariantGlobalization>false</InvariantGlobalization>
<UserSecretsId>35b5c460-a49c-4185-a169-676d90673146</UserSecretsId>
<AssemblyName>FoundationaLLM.Vectorization.API</AssemblyName>
<RootNamespace>FoundationaLLM.Vectorization.API</RootNamespace>
Expand Down

0 comments on commit 2843550

Please sign in to comment.