Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Azure SQL Database vectorization content source #516

Merged
merged 4 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace FoundationaLLM.Vectorization.Models.Configuration
{
    /// <summary>
    /// Configuration settings used to initialize an Azure SQL Database content source service.
    /// </summary>
    public class AzureSQLDatabaseContentSourceServiceSettings
    {
        /// <summary>
        /// The connection string used for authentication against the database.
        /// Remains <c>null</c> until a value is assigned.
        /// </summary>
        public string? ConnectionString { get; set; }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ public enum ContentSourceType
/// <summary>
/// SharePoint Online document library.
/// </summary>
SharePointOnline
SharePointOnline,

/// <summary>
/// Azure SQL Database.
/// </summary>
AzureSQLDatabase
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Vectorization.DataFormats.PDF;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models.Configuration;
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Logging;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
    /// <summary>
    /// Implements a vectorization content source for content residing in Azure SQL.
    /// </summary>
    public class AzureSQLDatabaseContentSourceService : ContentSourceServiceBase, IContentSourceService
    {
        private readonly ILogger<AzureSQLDatabaseContentSourceService> _logger;
        private readonly AzureSQLDatabaseContentSourceServiceSettings _settings;

        /// <summary>
        /// Creates a new instance of the vectorization content source.
        /// </summary>
        /// <param name="settings">The settings holding the connection string used to open the database connection.</param>
        /// <param name="loggerFactory">The factory used to create the logger for this service.</param>
        public AzureSQLDatabaseContentSourceService(
            AzureSQLDatabaseContentSourceServiceSettings settings,
            ILoggerFactory loggerFactory)
        {
            _settings = settings;
            _logger = loggerFactory.CreateLogger<AzureSQLDatabaseContentSourceService>();
        }

        /// <inheritdoc/>
        /// <remarks>
        /// The multipart identifier must have five parts, in this order:
        /// schema, table name, content column name, identifier column name, identifier value.
        /// </remarks>
        public async Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
        {
            ValidateMultipartId(multipartId, 5);

            var binaryContent = await GetBinaryContent(
                multipartId[0],
                multipartId[1],
                multipartId[2],
                multipartId[3],
                multipartId[4],
                cancellationToken);

            return await ExtractTextFromFileAsync(multipartId[4], binaryContent);
        }

        /// <summary>
        /// Retrieves the binary content.
        /// </summary>
        /// <param name="schema">The database schema containing the target table.</param>
        /// <param name="tableName">The name of the table from which to retrieve binary content.</param>
        /// <param name="contentColumnName">The name of the column containing binary data.</param>
        /// <param name="identifierColumnName">The name of the column used for identifying the specific record.</param>
        /// <param name="identifierValue">The value identifying the specific record in the identifier column.</param>
        /// <param name="cancellationToken">The cancellation token that signals that operations should be cancelled.</param>
        /// <returns>An object representing the binary contents.</returns>
        /// <exception cref="VectorizationException">
        /// Thrown when no matching record exists, when the content column holds a value that is neither
        /// binary nor text, or when any other error occurs while querying the database.
        /// </exception>
        private async Task<BinaryData> GetBinaryContent(string schema, string tableName, string contentColumnName, string identifierColumnName, string identifierValue, CancellationToken cancellationToken)
        {
            try
            {
                // Object names cannot be supplied as SQL parameters, so every identifier is
                // bracket-delimited (with embedded ']' escaped) before it is placed in the statement.
                // The lookup value itself is always passed as a parameter.
                var commandText =
                    $"SELECT TOP 1 {QuoteIdentifier(contentColumnName)} " +
                    $"FROM {QuoteIdentifier(schema)}.{QuoteIdentifier(tableName)} " +
                    $"WHERE {QuoteIdentifier(identifierColumnName)} = @identifierValue";

                await using var connection = new SqlConnection(_settings.ConnectionString);
                await connection.OpenAsync(cancellationToken);

                await using var command = new SqlCommand(commandText, connection);
                command.Parameters.Add(new SqlParameter("@identifierValue", identifierValue));

                await using var reader = await command.ExecuteReaderAsync(cancellationToken);

                // ReadAsync returns false when the result set is empty, which covers the "not found" case.
                if (!await reader.ReadAsync(cancellationToken))
                    throw new VectorizationException($"The file {identifierValue} was not found in the database.");

                // The value must be converted explicitly: passing it as System.Object would select the
                // BinaryData constructor that JSON-serializes its argument instead of wrapping the raw bytes.
                return reader.GetValue(0) switch
                {
                    byte[] bytes => new BinaryData(bytes),
                    string text => new BinaryData(text),
                    _ => throw new VectorizationException(
                        $"The content of the file {identifierValue} is empty or is stored in an unsupported column type.")
                };
            }
            catch (Exception ex) when (ex is not VectorizationException)
            {
                // VectorizationException instances raised above already carry a precise message and are not re-wrapped.
                throw new VectorizationException($"Error when extracting content from file identified by {identifierValue} in Azure SQL Database.", ex);
            }
        }

        /// <summary>
        /// Wraps a SQL Server object name in square brackets, doubling any closing bracket it contains
        /// so the name cannot terminate the delimiter and inject additional SQL.
        /// </summary>
        /// <param name="identifier">The raw (undelimited) schema, table or column name.</param>
        /// <returns>The bracket-delimited identifier.</returns>
        /// <exception cref="ArgumentException">Thrown when the identifier is null, empty or whitespace.</exception>
        private static string QuoteIdentifier(string identifier)
        {
            if (string.IsNullOrWhiteSpace(identifier))
                throw new ArgumentException("A database identifier must not be null or empty.", nameof(identifier));

            return $"[{identifier.Replace("]", "]]")}]";
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public IContentSourceService GetService(string serviceName)
{
ContentSourceType.AzureDataLake => CreateAzureDataLakeContentSourceService(serviceName),
ContentSourceType.SharePointOnline => CreateSharePointOnlineContentSourceService(serviceName),
ContentSourceType.AzureSQLDatabase => CreateAzureSQLDatabaseContentSourceService(serviceName),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand All @@ -55,6 +56,7 @@ public IContentSourceService GetService(string serviceName)
{
ContentSourceType.AzureDataLake => (CreateAzureDataLakeContentSourceService(serviceName), contentSourceProfile),
ContentSourceType.SharePointOnline => (CreateSharePointOnlineContentSourceService(serviceName), contentSourceProfile),
ContentSourceType.AzureSQLDatabase => (CreateAzureSQLDatabaseContentSourceService(serviceName), contentSourceProfile),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand Down Expand Up @@ -83,5 +85,17 @@ private SharePointOnlineContentSourceService CreateSharePointOnlineContentSource
sharePointOnlineContentSourceServiceSettings,
_loggerFactory);
}

/// <summary>
/// Builds an Azure SQL Database content source service whose settings are bound from the
/// configuration section named after the service, under the vectorization content sources key.
/// </summary>
/// <param name="serviceName">The name of the content source service; used as the last segment of the configuration key.</param>
/// <returns>The newly created content source service.</returns>
private AzureSQLDatabaseContentSourceService CreateAzureSQLDatabaseContentSourceService(string serviceName)
{
    var settings = new AzureSQLDatabaseContentSourceServiceSettings();
    var configurationKey = $"{AppConfigurationKeySections.FoundationaLLM_Vectorization_ContentSources}:{serviceName}";

    // Populate the settings instance from the matching configuration section.
    _configuration.Bind(configurationKey, settings);

    return new AzureSQLDatabaseContentSourceService(settings, _loggerFactory);
}
}
}
1 change: 1 addition & 0 deletions src/dotnet/Vectorization/Vectorization.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
<PackageReference Include="Azure.Security.KeyVault.Secrets" Version="4.5.0" />
<PackageReference Include="Azure.Storage.Blobs" Version="12.19.1" />
<PackageReference Include="Azure.Storage.Queues" Version="12.17.1" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.1.4" />
<PackageReference Include="ClosedXML" Version="0.104.0-preview2" />
<PackageReference Include="ClosedXML.Parser" Version="1.2.0" />
<PackageReference Include="DocumentFormat.OpenXml" Version="3.0.1" />
Expand Down
2 changes: 1 addition & 1 deletion src/dotnet/VectorizationAPI/VectorizationAPI.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<InvariantGlobalization>true</InvariantGlobalization>
<InvariantGlobalization>false</InvariantGlobalization>
<UserSecretsId>35b5c460-a49c-4185-a169-676d90673146</UserSecretsId>
<AssemblyName>FoundationaLLM.Vectorization.API</AssemblyName>
<RootNamespace>FoundationaLLM.Vectorization.API</RootNamespace>
Expand Down
Loading