diff --git a/src/dotnet/Common/Interfaces/IServiceFactory`1.cs b/src/dotnet/Common/Interfaces/IVectorizationServiceFactory`1.cs similarity index 100% rename from src/dotnet/Common/Interfaces/IServiceFactory`1.cs rename to src/dotnet/Common/Interfaces/IVectorizationServiceFactory`1.cs diff --git a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs index 316cb20c61..ca0b584b31 100644 --- a/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs +++ b/src/dotnet/Vectorization/Handlers/ExtractionHandler.cs @@ -1,6 +1,5 @@ using FoundationaLLM.Common.Constants; using FoundationaLLM.Common.Interfaces; -using FoundationaLLM.Common.Models.Chat; using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models; diff --git a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs index 10b00c964f..924f78bc5d 100644 --- a/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs +++ b/src/dotnet/Vectorization/Services/ContentSources/ContentSourceServiceFactory.cs @@ -4,15 +4,12 @@ using FoundationaLLM.Common.Settings; using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; -using FoundationaLLM.Vectorization.Models; using FoundationaLLM.Vectorization.Models.Configuration; using FoundationaLLM.Vectorization.Models.Resources; using FoundationaLLM.Vectorization.ResourceProviders; -using Microsoft.AspNetCore.Mvc; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; namespace FoundationaLLM.Vectorization.Services.ContentSources { diff --git a/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs b/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs index 925f119c85..5311e21968 100644 --- a/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs +++ b/src/dotnet/Vectorization/Services/ContentSources/DataLakeContentSourceService.cs @@ -1,11 +1,7 @@ -using FoundationaLLM.Common.Constants; -using FoundationaLLM.Common.Services; +using FoundationaLLM.Common.Services; using FoundationaLLM.Common.Settings; -using FoundationaLLM.Vectorization.DataFormats.PDF; -using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; namespace FoundationaLLM.Vectorization.Services.ContentSources { diff --git a/src/dotnet/Vectorization/Services/ContentSources/SharePointOnlineContentSourceService.cs b/src/dotnet/Vectorization/Services/ContentSources/SharePointOnlineContentSourceService.cs index 9e6023d7c8..314a7b2428 100644 --- a/src/dotnet/Vectorization/Services/ContentSources/SharePointOnlineContentSourceService.cs +++ b/src/dotnet/Vectorization/Services/ContentSources/SharePointOnlineContentSourceService.cs @@ -4,10 +4,14 @@ using FoundationaLLM.Vectorization.Exceptions; using FoundationaLLM.Vectorization.Interfaces; using FoundationaLLM.Vectorization.Models.Configuration; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; -using PnP.Framework; -using PnP.Framework.Modernization.Cache; +using PnP.Core.Auth; +using PnP.Core.Services.Builder.Configuration; +using PnP.Core.Services; using System.Security.Cryptography.X509Certificates; +using System; +using PnP.Core.Model.SharePoint; namespace FoundationaLLM.Vectorization.Services.ContentSources { @@ -18,6 +22,7 @@ public class SharePointOnlineContentSourceService : ContentSourceServiceBase, IC { private readonly SharePointOnlineContentSourceServiceSettings _settings; private readonly ILogger _logger; + private ServiceProvider? _serviceProvider; /// /// Creates a new instance of the vectorization content source. @@ -34,35 +39,40 @@ public SharePointOnlineContentSourceService( public async Task ExtractTextFromFileAsync(List multipartId, CancellationToken cancellationToken) { ValidateMultipartId(multipartId, 4); + await EnsureServiceProvider(multipartId); var binaryContent = await GetDocumentBinaryContent( - $"{multipartId[0]}/{multipartId[1]}", - $"{multipartId[1]}/{multipartId[2]}/{multipartId[3]}", + $"{multipartId[2]}/{multipartId[3]}", cancellationToken); - return await ExtractTextFromFileAsync(multipartId[2], binaryContent); + return await ExtractTextFromFileAsync(multipartId[3], binaryContent); } /// /// Retrieves the binary content of the specified SharePoint Online document. /// - /// The url of the document library. + /// The server relative url of the document. /// The cancellation token that signals that operations should be cancelled. /// An object representing the binary contents of the retrieved document. - private async Task GetDocumentBinaryContent(string documentLibrarySiteUrl, string documentRelativeUrl, CancellationToken cancellationToken) + private async Task GetDocumentBinaryContent(string documentRelativeUrl, CancellationToken cancellationToken) { - X509Certificate2 certificate = await GetCertificate(); - - var authManager = new AuthenticationManager(_settings.ClientId, certificate, _settings.TenantId); - using (var cc = authManager!.GetContext(documentLibrarySiteUrl, cancellationToken)) + using (var scope = _serviceProvider!.CreateScope()) { - var file = cc.Web.GetFileByServerRelativeUrl(documentRelativeUrl); - var stream = file.OpenBinaryStream(); - var bytes = stream.ToByteArray(); + var pnpContextFactory = scope.ServiceProvider.GetRequiredService(); + + using (var context = await pnpContextFactory.CreateAsync("Default")) + { + string documentUrl = $"{context.Uri.PathAndQuery}/{documentRelativeUrl}"; + // Get a reference to the file + IFile testDocument = await context.Web.GetFileByServerRelativeUrlAsync(documentUrl); + + Stream downloadedContentStream = await testDocument.GetContentAsync(true); + var binaryData = await BinaryData.FromStreamAsync(downloadedContentStream, cancellationToken); - return new BinaryData(bytes); - }; + return binaryData; + } + } } /// @@ -101,5 +111,31 @@ private void ValidateSettings() if (string.IsNullOrWhiteSpace(_settings.CertificateName)) throw new VectorizationException("Missing CertificateName in the SharePointOnlineContentSourceService configuration settings."); } + + private async Task EnsureServiceProvider(List multipartId) + { + if (_serviceProvider == null) + { + var certificate = await GetCertificate(); + var services = new ServiceCollection(); + services.AddLogging(); + services.AddPnPCore(async options => + { + var authProvider = new X509CertificateAuthenticationProvider( + _settings.ClientId, + _settings.TenantId, + certificate); + options.DefaultAuthenticationProvider = authProvider; + options.Sites.Add("Default", + new PnPCoreSiteOptions + { + SiteUrl = $"{multipartId[0]}/{multipartId[1]}", + AuthenticationProvider = authProvider + }); + }); + services.AddPnPContextFactory(); + _serviceProvider = services.BuildServiceProvider(); + } + } } } diff --git a/src/dotnet/Vectorization/Vectorization.csproj b/src/dotnet/Vectorization/Vectorization.csproj index 579ca81d86..5c0decc269 100644 --- a/src/dotnet/Vectorization/Vectorization.csproj +++ b/src/dotnet/Vectorization/Vectorization.csproj @@ -15,11 +15,13 @@ - + + - + +