Skip to content

Commit

Permalink
SharePoint Online vectorization content source
Browse files Browse the repository at this point in the history
  • Loading branch information
Ciprian Jichici committed Jan 29, 2024
1 parent e30cb2c commit 6260a9e
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 27 deletions.
1 change: 0 additions & 1 deletion src/dotnet/Vectorization/Handlers/ExtractionHandler.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Common.Interfaces;
using FoundationaLLM.Common.Models.Chat;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
using FoundationaLLM.Common.Settings;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models;
using FoundationaLLM.Vectorization.Models.Configuration;
using FoundationaLLM.Vectorization.Models.Resources;
using FoundationaLLM.Vectorization.ResourceProviders;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Common.Services;
using FoundationaLLM.Common.Services;
using FoundationaLLM.Common.Settings;
using FoundationaLLM.Vectorization.DataFormats.PDF;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using PnP.Framework;
using PnP.Framework.Modernization.Cache;
using PnP.Core.Auth;
using PnP.Core.Services.Builder.Configuration;
using PnP.Core.Services;
using System.Security.Cryptography.X509Certificates;
using System;
using PnP.Core.Model.SharePoint;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand All @@ -18,6 +22,7 @@ public class SharePointOnlineContentSourceService : ContentSourceServiceBase, IC
{
private readonly SharePointOnlineContentSourceServiceSettings _settings;
private readonly ILogger<SharePointOnlineContentSourceService> _logger;
private ServiceProvider? _serviceProvider;

/// <summary>
/// Creates a new instance of the vectorization content source.
Expand All @@ -34,35 +39,40 @@ public SharePointOnlineContentSourceService(
// NOTE(review): this listing is a rendered diff without +/- markers, so some lines below appear
// in pairs (presumably the removed line followed by its replacement) — confirm against the real
// file which of the duplicated arguments / return statements is the live one.
//
// Flow visible here: validate the multipart id, make sure the service provider has been created,
// download the document's binary content, then hand that content to another
// ExtractTextFromFileAsync overload (defined outside this view) to obtain the text.
public async Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
{
// presumably enforces that the id has exactly 4 components — helper is defined outside this view.
ValidateMultipartId(multipartId, 4);
// Must run before GetDocumentBinaryContent, which dereferences _serviceProvider with the null-forgiving operator.
await EnsureServiceProvider(multipartId);

// The document path is composed from components of multipartId joined with "/".
var binaryContent = await GetDocumentBinaryContent(
$"{multipartId[0]}/{multipartId[1]}",
$"{multipartId[1]}/{multipartId[2]}/{multipartId[3]}",
$"{multipartId[2]}/{multipartId[3]}",
cancellationToken);

// The first argument is a single component of multipartId (index 2 vs. 3 in the two diff variants);
// presumably the file name used to select the text extractor — TODO confirm.
return await ExtractTextFromFileAsync(multipartId[2], binaryContent);
return await ExtractTextFromFileAsync(multipartId[3], binaryContent);
}

// NOTE(review): this listing is a rendered diff without +/- markers. Two signatures and two
// implementations (an AuthenticationManager/GetContext based one and an IPnPContextFactory based
// one) are interleaved below — confirm against the real file which lines are live.
/// <summary>
/// Retrieves the binary content of the specified SharePoint Online document.
/// </summary>
/// <param name="documentLibrarySiteUrl">The url of the document library.</param>

/// <param name="documentRelativeUrl">The server relative url of the document.</param>
/// <param name="cancellationToken">The cancellation token that signals that operations should be cancelled.</param>
/// <returns>An object representing the binary contents of the retrieved document.</returns>
private async Task<BinaryData> GetDocumentBinaryContent(string documentLibrarySiteUrl, string documentRelativeUrl, CancellationToken cancellationToken)
private async Task<BinaryData> GetDocumentBinaryContent(string documentRelativeUrl, CancellationToken cancellationToken)
{
X509Certificate2 certificate = await GetCertificate();

var authManager = new AuthenticationManager(_settings.ClientId, certificate, _settings.TenantId);
using (var cc = authManager!.GetContext(documentLibrarySiteUrl, cancellationToken))
// _serviceProvider is null-forgiven here: EnsureServiceProvider must have been awaited by the caller first.
using (var scope = _serviceProvider!.CreateScope())
{
var file = cc.Web.GetFileByServerRelativeUrl(documentRelativeUrl);
var stream = file.OpenBinaryStream();
var bytes = stream.ToByteArray();
// Resolve the PnP context factory from the scoped provider.
var pnpContextFactory = scope.ServiceProvider.GetRequiredService<IPnPContextFactory>();

// "Default" is the site name registered in EnsureServiceProvider.
using (var context = await pnpContextFactory.CreateAsync("Default"))
{
// Prefix the path portion of the context's URI to the document path to build the URL passed to the lookup below.
string documentUrl = $"{context.Uri.PathAndQuery}/{documentRelativeUrl}";
// Get a reference to the file
IFile testDocument = await context.Web.GetFileByServerRelativeUrlAsync(documentUrl);

// presumably `true` asks PnP Core to stream the content rather than buffer it — confirm against the IFile.GetContentAsync docs.
Stream downloadedContentStream = await testDocument.GetContentAsync(true);
// Read the stream to the end into a BinaryData instance, honoring the cancellation token.
var binaryData = await BinaryData.FromStreamAsync(downloadedContentStream, cancellationToken);

return new BinaryData(bytes);
};
return binaryData;
}
}
}

/// <summary>
Expand Down Expand Up @@ -101,5 +111,31 @@ private void ValidateSettings()
if (string.IsNullOrWhiteSpace(_settings.CertificateName))
throw new VectorizationException("Missing CertificateName in the SharePointOnlineContentSourceService configuration settings.");
}

/// <summary>
/// Lazily builds the dependency injection container that hosts the PnP Core services
/// and caches it in <c>_serviceProvider</c>. Does nothing when the container already exists.
/// </summary>
/// <param name="multipartId">
/// The multipart identifier of the content. Its first two components, joined with "/",
/// are used as the URL of the site registered under the name "Default".
/// </param>
/// <returns>A task that completes once the service provider is available.</returns>
private async Task EnsureServiceProvider(List<string> multipartId)
{
    // NOTE(review): the provider is created only once, so the "Default" site is bound to the
    // site URL of the first multipart id seen; later calls with a different site reuse it.
    if (_serviceProvider is not null)
    {
        return;
    }

    var certificate = await GetCertificate();

    var services = new ServiceCollection();
    services.AddLogging();

    // The configuration callback is deliberately synchronous: it awaits nothing, and an
    // async lambda bound to an Action delegate would compile to async void, making any
    // exception thrown during configuration unobservable.
    services.AddPnPCore(options =>
    {
        var authProvider = new X509CertificateAuthenticationProvider(
            _settings.ClientId,
            _settings.TenantId,
            certificate);

        options.DefaultAuthenticationProvider = authProvider;
        options.Sites.Add("Default",
            new PnPCoreSiteOptions
            {
                SiteUrl = $"{multipartId[0]}/{multipartId[1]}",
                AuthenticationProvider = authProvider
            });
    });
    services.AddPnPContextFactory();

    _serviceProvider = services.BuildServiceProvider();
}
}
}
6 changes: 4 additions & 2 deletions src/dotnet/Vectorization/Vectorization.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
<PackageReference Include="Azure.Security.KeyVault.Secrets" Version="4.5.0" />
<PackageReference Include="Azure.Storage.Blobs" Version="12.19.1" />
<PackageReference Include="Azure.Storage.Queues" Version="12.17.1" />
<PackageReference Include="ClosedXML" Version="0.102.2" />
<PackageReference Include="ClosedXML" Version="0.104.0-preview2" />
<PackageReference Include="ClosedXML.Parser" Version="1.2.0" />
<PackageReference Include="DocumentFormat.OpenXml" Version="3.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="8.0.0" />
<PackageReference Include="PdfPig" Version="0.1.8" />
<PackageReference Include="PnP.Framework" Version="1.14.0" />
<PackageReference Include="PnP.Core" Version="1.11.0" />
<PackageReference Include="PnP.Core.Auth" Version="1.11.0" />
<PackageReference Include="System.Memory.Data" Version="8.0.0" />
</ItemGroup>

Expand Down

0 comments on commit 6260a9e

Please sign in to comment.