Skip to content

Commit

Permalink
Merge pull request #967 from solliancenet/cp-e2e-tests
Browse files Browse the repository at this point in the history
SharePoint Online vectorization e2e examples
  • Loading branch information
ciprianjichici authored May 14, 2024
2 parents 2fe1aa3 + d1cee0f commit 02481ac
Show file tree
Hide file tree
Showing 10 changed files with 590 additions and 10 deletions.
4 changes: 2 additions & 2 deletions tests/dotnet/Core.Examples/Catalogs/DataSourceCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ public static class DataSourceCatalog
{
public static readonly List<DataSourceBase> Items =
[
new AzureDataLakeDataSource { Name = "datalake_vectorization_input", DisplayName = "datalake_vectorization_input", ConfigurationReferences = new Dictionary<string, string> { { "AuthenticationType", "FoundationaLLM:DataSources:datalake_vectorization_input:AuthenticationType" }, { "AccountName", "FoundationaLLM:DataSources:datalake_vectorization_input:AccountName" } }, Folders = new List<string> { "vectorization-input" } }

new AzureDataLakeDataSource { Name = "datalake_vectorization_input", DisplayName = "datalake_vectorization_input", ConfigurationReferences = new Dictionary<string, string> { { "AuthenticationType", "FoundationaLLM:DataSources:datalake_vectorization_input:AuthenticationType" }, { "AccountName", "FoundationaLLM:DataSources:datalake_vectorization_input:AccountName" } }, Folders = new List<string> { "vectorization-input" } },
new SharePointOnlineSiteDataSource { Name = "sharepoint_fllm", DisplayName="sharepoint_fllm", SiteUrl="https://fllm.sharepoint.com/sites/FoundationaLLM", DocumentLibraries=["/documents02"], ConfigurationReferences = new Dictionary<string, string>{ {"ClientId", "FoundationaLLM:DataSources:sharepoint_fllm:ClientId" },{"TenantId", "FoundationaLLM:DataSources:sharepoint_fllm:TenantId" },{"CertificateName", "FoundationaLLM:DataSources:sharepoint_fllm:CertificateName" },{ "KeyVaultURL", "FoundationaLLM:DataSources:sharepoint_fllm:KeyVaultURL" } } }
];

public static List<DataSourceBase> GetDataSources()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public static class IndexingProfilesCatalog
public static readonly List<IndexingProfile> Items =
[
new IndexingProfile { Name = "indexing_profile_really_big", Indexer = IndexerType.AzureAISearchIndexer, Settings = new Dictionary<string, string>{ { "IndexName", "reallybig" }, { "TopN", "3" }, { "Filters", "" }, { "EmbeddingFieldName", "Embedding" }, { "TextFieldName", "Text" } }, ConfigurationReferences = new Dictionary<string, string>{ { "AuthenticationType", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:AuthenticationType" }, { "Endpoint", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:Endpoint" } } },
new IndexingProfile { Name = "indexing_profile_pdf_datalake", Indexer = IndexerType.AzureAISearchIndexer, Settings = new Dictionary<string, string>{ { "IndexName", "pdfdatalake" }, { "TopN", "3" }, { "Filters", "" }, { "EmbeddingFieldName", "Embedding" }, { "TextFieldName", "Text" } }, ConfigurationReferences = new Dictionary<string, string>{ { "AuthenticationType", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:AuthenticationType" }, { "Endpoint", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:Endpoint" } } }
new IndexingProfile { Name = "indexing_profile_pdf", Indexer = IndexerType.AzureAISearchIndexer, Settings = new Dictionary<string, string>{ { "IndexName", "pdf" }, { "TopN", "3" }, { "Filters", "" }, { "EmbeddingFieldName", "Embedding" }, { "TextFieldName", "Text" } }, ConfigurationReferences = new Dictionary<string, string>{ { "AuthenticationType", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:AuthenticationType" }, { "Endpoint", "FoundationaLLM:Vectorization:AzureAISearchIndexingService:Endpoint" } } }
];

public static List<IndexingProfile> GetIndexingProfiles()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@ namespace FoundationaLLM.Core.Examples
/// FoundationaLLM:DataSources:datalake_vectorization_input:AccountName
/// Expects the following document in the storage account:
/// /vectorization-input/SDZWA-Journal-January-2024.pdf
/// References:
/// PDF public source: https://sandiegozoowildlifealliance.org/Journal/january-2024
/// </summary>
public class Example0004_SynchronousVectorizationOfPDFFromDataLake: BaseTest, IClassFixture<TestFixture>
{
private readonly IVectorizationTestService _vectorizationTestService;
private InstanceSettings _instanceSettings;
private string containerName = "vectorization-input";
private string containerName = "vectorization-input";
private string blobName = "SDZWA-Journal-January-2024.pdf";
private string dataSourceName = "datalake_vectorization_input";
private string dataSourceObjectId = String.Empty;
private string textPartitioningProfileName = "text_partition_profile";
private string textEmbeddingProfileName = "text_embedding_profile_generic";
private string indexingProfileName = "indexing_profile_pdf_datalake";
private string indexingProfileName = "indexing_profile_pdf";
private string searchString = "Kurt and Ollie";
private string id = String.Empty;
private BlobStorageServiceSettings? _settings;
Expand Down Expand Up @@ -60,6 +62,8 @@ private async Task RunExampleAsync()
WriteLine($"Create the data source: {dataSourceName} via the Management API");
await _vectorizationTestService.CreateDataSource(dataSourceName);

Thread.Sleep(5000); // processing too quickly, pause after the creation of the data source

WriteLine($"Create the vectorization text partitioning profile: {textPartitioningProfileName} via the Management API");
await _vectorizationTestService.CreateTextPartitioningProfile(textPartitioningProfileName);

Expand Down Expand Up @@ -141,7 +145,7 @@ private async Task RunExampleAsync()
WriteLine($"Delete the vectorization text embedding profile: {textEmbeddingProfileName} via the Management API");
await _vectorizationTestService.DeleteTextEmbeddingProfile(textEmbeddingProfileName);

WriteLine($"Delete the vectorization indexing profile: {indexingProfileName} via the Management API");
WriteLine($"Delete the vectorization indexing profile: {indexingProfileName} via the Management API and delete the created index");
await _vectorizationTestService.DeleteIndexingProfile(indexingProfileName, true);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@ namespace FoundationaLLM.Core.Examples
/// FoundationaLLM:DataSources:datalake_vectorization_input:AccountName
/// Expects the following document in the storage account:
/// /vectorization-input/SDZWA-Journal-January-2024.pdf
/// References:
/// PDF public source: https://sandiegozoowildlifealliance.org/Journal/january-2024
/// </summary>
public class Example0005_AsynchronousVectorizationOfPDFFromDataLake: BaseTest, IClassFixture<TestFixture>
{
private readonly IVectorizationTestService _vectorizationTestService;
private InstanceSettings _instanceSettings;
private string containerName = "vectorization-input";
private string containerName = "vectorization-input";
private string blobName = "SDZWA-Journal-January-2024.pdf";
private string dataSourceName = "datalake_vectorization_input";
private string dataSourceObjectId = String.Empty;
private string textPartitioningProfileName = "text_partition_profile";
private string textEmbeddingProfileName = "text_embedding_profile_generic";
private string indexingProfileName = "indexing_profile_pdf_datalake";
private string indexingProfileName = "indexing_profile_pdf";
private string searchString = "Kurt and Ollie";
private string id = String.Empty;
private BlobStorageServiceSettings? _settings;
Expand Down Expand Up @@ -60,6 +62,8 @@ private async Task RunExampleAsync()
WriteLine($"Create the data source: {dataSourceName} via the Management API");
await _vectorizationTestService.CreateDataSource(dataSourceName);

Thread.Sleep(5000); // processing too quickly, pause after the creation of the data source

WriteLine($"Create the vectorization text partitioning profile: {textPartitioningProfileName} via the Management API");
await _vectorizationTestService.CreateTextPartitioningProfile(textPartitioningProfileName);

Expand Down Expand Up @@ -163,7 +167,7 @@ private async Task RunExampleAsync()
WriteLine($"Delete the vectorization text embedding profile: {textEmbeddingProfileName} via the Management API");
await _vectorizationTestService.DeleteTextEmbeddingProfile(textEmbeddingProfileName);

WriteLine($"Delete the vectorization indexing profile: {indexingProfileName} via the Management API");
WriteLine($"Delete the vectorization indexing profile: {indexingProfileName} via the Management API along with the index");
await _vectorizationTestService.DeleteIndexingProfile(indexingProfileName, true);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
using FoundationaLLM.Common.Constants.ResourceProviders;
using FoundationaLLM.Common.Models.Configuration.Instance;
using FoundationaLLM.Common.Models.ResourceProviders.Vectorization;
using FoundationaLLM.Common.Models.Vectorization;
using FoundationaLLM.Core.Examples.Interfaces;
using FoundationaLLM.Core.Examples.Models;
using FoundationaLLM.Core.Examples.Setup;
using Xunit.Abstractions;

namespace FoundationaLLM.Core.Examples
{
    /// <summary>
    /// Example class for running synchronous vectorization over a PDF file in a SharePoint Online Site document library.
    /// Expects the following configuration values:
    ///     FoundationaLLM:DataSources:sharepoint_fllm:ClientId
    ///     FoundationaLLM:DataSources:sharepoint_fllm:TenantId
    ///     FoundationaLLM:DataSources:sharepoint_fllm:CertificateName
    ///     FoundationaLLM:DataSources:sharepoint_fllm:KeyVaultURL
    /// The location of the file to vectorize (host name, site path, folder path and file name) is read from
    /// <c>TestConfiguration.SharePointVectorizationConfiguration</c>.
    /// References:
    ///     PDF public source: https://sandiegozoowildlifealliance.org/Journal/january-2024
    /// </summary>
    public class Example0008_SynchronousVectorizationOfPDFFromSharePoint: BaseTest, IClassFixture<TestFixture>
    {
        /// <summary>
        /// Number of query results and vector results the example expects the search to report.
        /// </summary>
        private const int ExpectedResultCount = 27;

        private readonly IVectorizationTestService _vectorizationTestService;
        private readonly InstanceSettings _instanceSettings;
        private readonly string dataSourceName = "sharepoint_fllm";
        private readonly string dataSourceObjectId = String.Empty;
        private readonly string textPartitioningProfileName = "text_partition_profile";
        private readonly string textEmbeddingProfileName = "text_embedding_profile_generic";
        private readonly string indexingProfileName = "indexing_profile_pdf";
        private readonly string searchString = "Kurt and Ollie";
        private readonly string id = String.Empty;
        private readonly SharePointVectorizationConfiguration _sharePointVectorizationConfiguration;

        /// <summary>
        /// Resolves the vectorization test service, builds the data source object id for the current
        /// instance, generates a fresh vectorization request id and loads the SharePoint test configuration.
        /// </summary>
        /// <param name="output">The xUnit output helper passed to the base test class.</param>
        /// <param name="fixture">The shared test fixture whose service provider is passed to the base test class.</param>
        public Example0008_SynchronousVectorizationOfPDFFromSharePoint(ITestOutputHelper output, TestFixture fixture)
            : base(output, fixture.ServiceProvider)
        {
            _vectorizationTestService = GetService<IVectorizationTestService>();
            _instanceSettings = _vectorizationTestService.InstanceSettings;
            dataSourceObjectId = $"/instances/{_instanceSettings.Id}/providers/FoundationaLLM.DataSource/dataSources/{dataSourceName}";
            id = Guid.NewGuid().ToString();
            _sharePointVectorizationConfiguration = TestConfiguration.SharePointVectorizationConfiguration;
        }

        /// <summary>
        /// Test entry point: writes the example banner and runs the example.
        /// </summary>
        [Fact]
        public async Task RunAsync()
        {
            WriteLine("============ Synchronous Vectorization of a PDF from SharePoint Online ============");
            await RunExampleAsync();
        }

        /// <summary>
        /// Creates the data source and vectorization profiles, submits and processes a synchronous
        /// vectorization request for the configured SharePoint file, verifies the search result counts,
        /// and finally deletes the created resources.
        /// </summary>
        /// <remarks>
        /// The delete calls at the end are reached only when every preceding step succeeds;
        /// an exception thrown earlier leaves the created resources in place.
        /// </remarks>
        /// <exception cref="Exception">
        /// Thrown when the request cannot be retrieved after creation, when processing returns no result
        /// or an unsuccessful result, or when the search does not return the expected number of results.
        /// </exception>
        private async Task RunExampleAsync()
        {
            WriteLine($"Create the data source: {dataSourceName} via the Management API");
            await _vectorizationTestService.CreateDataSource(dataSourceName);

            // processing too quickly, pause after the creation of the data source
            // (awaited delay instead of Thread.Sleep so no thread is blocked inside the async method)
            await Task.Delay(TimeSpan.FromSeconds(5));

            WriteLine($"Create the vectorization text partitioning profile: {textPartitioningProfileName} via the Management API");
            await _vectorizationTestService.CreateTextPartitioningProfile(textPartitioningProfileName);

            WriteLine($"Create the vectorization text embedding profile: {textEmbeddingProfileName} via the Management API");
            await _vectorizationTestService.CreateTextEmbeddingProfile(textEmbeddingProfileName);

            WriteLine($"Create the vectorization indexing profile: {indexingProfileName} via the Management API");
            await _vectorizationTestService.CreateIndexingProfile(indexingProfileName);

            // used for defining the canonical id, optionally remove the file extension
            // (a file name without any '.' is used as-is instead of failing in Substring)
            var fileName = _sharePointVectorizationConfiguration.FileName;
            var extensionIndex = fileName.LastIndexOf('.');
            var fileNameWithoutExtension = extensionIndex >= 0
                ? fileName.Substring(0, extensionIndex)
                : fileName;

            ContentIdentifier ci = new ContentIdentifier
            {
                DataSourceObjectId = dataSourceObjectId,
                MultipartId = new List<string>
                {
                    _sharePointVectorizationConfiguration.HostName,
                    _sharePointVectorizationConfiguration.SitePath,
                    _sharePointVectorizationConfiguration.FolderPath,
                    _sharePointVectorizationConfiguration.FileName
                },
                CanonicalId = $"{_sharePointVectorizationConfiguration.FolderPath}/{fileNameWithoutExtension}"
            };

            WriteLine($"Create the vectorization request: {id} via the Management API");
            List<VectorizationStep> steps =
            [
                new VectorizationStep { Id = "extract", Parameters = new Dictionary<string, string>() },
                new VectorizationStep { Id = "partition", Parameters = new Dictionary<string, string>() { { "text_partitioning_profile_name", textPartitioningProfileName } } },
                new VectorizationStep { Id = "embed", Parameters = new Dictionary<string, string>() { { "text_embedding_profile_name", textEmbeddingProfileName } } },
                new VectorizationStep { Id = "index", Parameters = new Dictionary<string, string>() { { "indexing_profile_name", indexingProfileName } } },
            ];
            var request = new VectorizationRequest
            {
                RemainingSteps = new List<string> { "extract", "partition", "embed", "index" },
                CompletedSteps = new List<string>(),
                ProcessingType = VectorizationProcessingType.Synchronous,
                ContentIdentifier = ci,
                Id = id,
                Steps = steps,
                ObjectId = $"{VectorizationResourceTypeNames.VectorizationRequests}/{id}"
            };
            //Create the vectorization request, re-assign the fully qualified object id if desired.
            request.ObjectId = await _vectorizationTestService.CreateVectorizationRequest(request);

            WriteLine($"Verify the vectorization request {id} was created by retrieving it from the Management API");
            var resource = await _vectorizationTestService.GetVectorizationRequest(request);
            if (resource == null)
            {
                throw new Exception("Vectorization request failed creation. Invalid result was returned.");
            }

            WriteLine($"Issue the process action on the vectorization request: {id} via the Management API");
            var vectorizationResult = await _vectorizationTestService.ProcessVectorizationRequest(request);

            // Ensure the vectorization request was successful
            if (vectorizationResult == null)
            {
                throw new Exception("Vectorization request failed to complete successfully. Invalid result was returned.");
            }

            if (vectorizationResult.IsSuccess == false)
            {
                //retrieve more verbose error logging from resource....
                resource = await _vectorizationTestService.GetVectorizationRequest(request);

                // Guard the re-fetched resource so a missing request or missing error list does not
                // replace the real failure with a NullReferenceException.
                var errorDetails = resource?.ErrorMessages is { } errorMessages
                    ? string.Join("\n", errorMessages)
                    : "(no error details could be retrieved)";
                throw new Exception($"Vectorization request failed to complete successfully. Message(s):\n{errorDetails}");
            }

            WriteLine($"Vectorization request: {id} completed successfully.");

            WriteLine($"Verify a search yields {ExpectedResultCount} documents.");
            TestSearchResult result = await _vectorizationTestService.QueryIndex(indexingProfileName, textEmbeddingProfileName, searchString);
            if (result.QueryResult.TotalCount != ExpectedResultCount)
            {
                throw new Exception($"Query did not return the expected number of query results. Expected: {ExpectedResultCount}, Retrieved: {result.QueryResult.TotalCount}");
            }
            if (result.VectorResults.TotalCount != ExpectedResultCount)
            {
                throw new Exception($"Query did not return the expected number of vector results. Expected: {ExpectedResultCount}, Retrieved: {result.VectorResults.TotalCount}");
            }

            WriteLine($"Delete the data source: {dataSourceName} via the Management API");
            await _vectorizationTestService.DeleteDataSource(dataSourceName);

            WriteLine($"Delete the vectorization text partitioning profile: {textPartitioningProfileName} via the Management API");
            await _vectorizationTestService.DeleteTextPartitioningProfile(textPartitioningProfileName);

            WriteLine($"Delete the vectorization text embedding profile: {textEmbeddingProfileName} via the Management API");
            await _vectorizationTestService.DeleteTextEmbeddingProfile(textEmbeddingProfileName);

            WriteLine($"Delete the vectorization indexing profile: {indexingProfileName} via the Management API along with the index");
            await _vectorizationTestService.DeleteIndexingProfile(indexingProfileName, true);
        }
    }
}
Loading

0 comments on commit 02481ac

Please sign in to comment.