Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SharePoint Online vectorization content source #515

Merged
merged 6 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using FoundationaLLM.Vectorization.Exceptions;
using System.Text;

namespace FoundationaLLM.Vectorization.DataFormats.Office
{
/// <summary>
/// Extracts text from DOCX files.
/// </summary>
public class DOCXTextExtractor
{
    /// <summary>
    /// Extracts the text content from a DOCX document.
    /// </summary>
    /// <param name="binaryContent">The binary content of the DOCX document.</param>
    /// <returns>
    /// The inner text of every paragraph found in the document body, one paragraph per line,
    /// with leading and trailing whitespace trimmed.
    /// </returns>
    /// <exception cref="VectorizationException">
    /// Thrown when the main document part or the document body is missing.
    /// </exception>
    public static string GetText(BinaryData binaryContent)
    {
        StringBuilder sb = new();

        using var stream = binaryContent.ToStream();

        // The opened package is disposable; release it as soon as the text has been read
        // (including when one of the exceptions below is thrown).
        using var wordprocessingDocument = WordprocessingDocument.Open(stream, false);

        var mainPart = wordprocessingDocument.MainDocumentPart ?? throw new VectorizationException("The main document part is missing.");
        var body = mainPart.Document.Body ?? throw new VectorizationException("The document body is missing.");

        // Descendants<T>() yields an empty sequence when there are no paragraphs, so no null check is needed.
        foreach (Paragraph paragraph in body.Descendants<Paragraph>())
        {
            sb.AppendLine(paragraph.InnerText);
        }

        return sb.ToString().Trim();
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Presentation;
using FoundationaLLM.Vectorization.Exceptions;

namespace FoundationaLLM.Vectorization.DataFormats.Office
{
/// <summary>
/// Text extractor for PPTX files. Extraction is not implemented: the only member always throws.
/// </summary>
public class PPTXTextExtractor
{
    /// <summary>
    /// Placeholder for PPTX text extraction; always fails because the format is not supported.
    /// </summary>
    /// <param name="binaryContent">The binary content of the PPTX document (not read).</param>
    /// <returns>Never returns normally.</returns>
    /// <exception cref="VectorizationException">Always thrown.</exception>
    public static string GetText(BinaryData binaryContent)
    {
        throw new VectorizationException("The file type .pptx is not supported.");
    }
}
}
120 changes: 120 additions & 0 deletions src/dotnet/Vectorization/DataFormats/Office/XLSXTextExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
using ClosedXML.Excel;
using System.Text;

namespace FoundationaLLM.Vectorization.DataFormats.Office
{
/// <summary>
/// Extracts text from XLSX files.
/// </summary>
public class XLSXTextExtractor
{
    private const string DefaultSheetNumberTemplate = "\n# Worksheet {number}\n";
    private const string DefaultEndOfSheetTemplate = "\n# End of worksheet {number}";
    private const string DefaultRowPrefix = "";
    private const string DefaultColumnSeparator = ", ";
    private const string DefaultRowSuffix = "";

    private readonly bool _withWorksheetNumber;
    private readonly bool _withEndOfWorksheetMarker;
    private readonly bool _withQuotes;
    private readonly string _worksheetNumberTemplate;
    private readonly string _endOfWorksheetMarkerTemplate;
    private readonly string _rowPrefix;
    private readonly string _columnSeparator;
    private readonly string _rowSuffix;

    /// <summary>
    /// Constructor for XLSXTextExtractor.
    /// </summary>
    /// <param name="withWorksheetNumber">Whether a worksheet header line is written before the rows of each worksheet.</param>
    /// <param name="withEndOfWorksheetMarker">Whether an end-of-worksheet line is written after the rows of each worksheet.</param>
    /// <param name="withQuotes">Whether text cells are wrapped in double quotes, with embedded double quotes doubled.</param>
    /// <param name="worksheetNumberTemplate">The header template; <c>{number}</c> is replaced with the 1-based worksheet number. Defaults to <c>"\n# Worksheet {number}\n"</c>.</param>
    /// <param name="endOfWorksheetMarkerTemplate">The end marker template; <c>{number}</c> is replaced with the 1-based worksheet number. Defaults to <c>"\n# End of worksheet {number}"</c>.</param>
    /// <param name="rowPrefix">The text written before each row. Defaults to an empty string.</param>
    /// <param name="columnSeparator">The text written between the cells of a row. Defaults to <c>", "</c>.</param>
    /// <param name="rowSuffix">The text written after each row, before the line break. Defaults to an empty string.</param>
    public XLSXTextExtractor(
        bool withWorksheetNumber = true,
        bool withEndOfWorksheetMarker = false,
        bool withQuotes = true,
        string? worksheetNumberTemplate = null,
        string? endOfWorksheetMarkerTemplate = null,
        string? rowPrefix = null,
        string? columnSeparator = null,
        string? rowSuffix = null)
    {
        this._withWorksheetNumber = withWorksheetNumber;
        this._withEndOfWorksheetMarker = withEndOfWorksheetMarker;
        this._withQuotes = withQuotes;

        this._worksheetNumberTemplate = worksheetNumberTemplate ?? DefaultSheetNumberTemplate;
        this._endOfWorksheetMarkerTemplate = endOfWorksheetMarkerTemplate ?? DefaultEndOfSheetTemplate;

        this._rowPrefix = rowPrefix ?? DefaultRowPrefix;
        this._columnSeparator = columnSeparator ?? DefaultColumnSeparator;
        this._rowSuffix = rowSuffix ?? DefaultRowSuffix;
    }

    /// <summary>
    /// Extracts the text content from an XLSX document.
    /// </summary>
    /// <param name="binaryContent">The binary content of the XLSX document.</param>
    /// <returns>
    /// The used cells of every worksheet, one row per line, optionally framed by the worksheet
    /// header and end marker, with leading and trailing whitespace trimmed.
    /// </returns>
    public string GetText(BinaryData binaryContent)
    {
        var sb = new StringBuilder();

        using var stream = binaryContent.ToStream();
        using var workbook = new XLWorkbook(stream);

        var worksheetNumber = 0;
        foreach (var worksheet in workbook.Worksheets)
        {
            worksheetNumber++;
            if (this._withWorksheetNumber)
            {
                sb.AppendLine(this._worksheetNumberTemplate.Replace("{number}", $"{worksheetNumber}", StringComparison.OrdinalIgnoreCase));
            }

            // RangeUsed() returns null for a worksheet without any used cell. Such a worksheet
            // contributes no rows but still gets its header and end marker.
            var usedRows = worksheet.RangeUsed()?.RowsUsed();
            if (usedRows is not null)
            {
                foreach (IXLRangeRow? row in usedRows)
                {
                    if (row is null) { continue; }

                    this.AppendRow(sb, row);
                }
            }

            if (this._withEndOfWorksheetMarker)
            {
                sb.AppendLine(this._endOfWorksheetMarkerTemplate.Replace("{number}", $"{worksheetNumber}", StringComparison.OrdinalIgnoreCase));
            }
        }

        return sb.ToString().Trim();
    }

    /// <summary>
    /// Appends one row to the output: prefix, used cells joined by the column separator, suffix, line break.
    /// </summary>
    private void AppendRow(StringBuilder sb, IXLRangeRow row)
    {
        var cells = row.CellsUsed().ToList();

        sb.Append(this._rowPrefix);
        for (var i = 0; i < cells.Count; i++)
        {
            IXLCell? cell = cells[i];

            if (this._withQuotes && cell is { Value.IsText: true })
            {
                // Quote text cells and double any embedded quote so the cell boundary stays unambiguous.
                sb.Append('"')
                    .Append(cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal))
                    .Append('"');
            }
            else
            {
                sb.Append(cell.Value);
            }

            // The separator goes between cells only, never after the last one.
            if (i < cells.Count - 1)
            {
                sb.Append(this._columnSeparator);
            }
        }

        sb.AppendLine(this._rowSuffix);
    }
}
}
1 change: 0 additions & 1 deletion src/dotnet/Vectorization/Handlers/ExtractionHandler.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Common.Interfaces;
using FoundationaLLM.Common.Models.Chat;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ public interface IContentSourceService
/// <param name="multipartId">The multipart unique identifier of the file being read.</param>
/// <param name="cancellationToken">The cancellation token that signals that operations should be cancelled.</param>
/// <returns>The string content of the file.</returns>
Task<String> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken);
Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace FoundationaLLM.Vectorization.Models.Configuration
{
/// <summary>
/// Provides configuration settings to initialize a SharePoint Online content source service.
/// </summary>
/// <remarks>
/// Every property is a nullable string without a default value, so a setting that is never assigned stays <c>null</c>.
/// </remarks>
public class SharePointOnlineContentSourceServiceSettings
{
/// <summary>
/// The application (client) identifier of the Azure app registration.
/// </summary>
public string? ClientId { get; set; }

/// <summary>
/// The identifier of the Azure tenant in which the app was registered.
/// </summary>
public string? TenantId { get; set; }

/// <summary>
/// The URL of the Azure Key Vault in which the X.509 certificate is stored.
/// </summary>
public string? KeyVaultURL { get; set; }

/// <summary>
/// The name of the X.509 certificate stored in the Azure Key Vault.
/// </summary>
public string? CertificateName { get; set; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ public enum ContentSourceType
/// <summary>
/// Azure data lake storage account.
/// </summary>
AzureDataLake
AzureDataLake,

/// <summary>
/// SharePoint Online document library.
/// </summary>
SharePointOnline
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Vectorization.DataFormats.Office;
using FoundationaLLM.Vectorization.DataFormats.PDF;
using FoundationaLLM.Vectorization.Exceptions;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand All @@ -13,12 +16,38 @@ public class ContentSourceServiceBase
/// <param name="multipartId">The multipart identifier to validate.</param>
/// <param name="partsCount">The required number of parts in the multipart identifier.</param>
/// <exception cref="VectorizationException"></exception>
public void ValidateMultipartId(List<string> multipartId, int partsCount)
public static void ValidateMultipartId(List<string> multipartId, int partsCount)
{
    // A valid identifier is non-null, has exactly the required number of parts,
    // and none of its parts is null, empty, or whitespace.
    var isValid = multipartId is not null
        && multipartId.Count == partsCount
        && multipartId.All(part => !string.IsNullOrWhiteSpace(part));

    if (!isValid)
    {
        throw new VectorizationException("Invalid multipart identifier.");
    }
}

/// <summary>
/// Extracts the text content of a file from its binary content, selecting the extractor by file extension.
/// </summary>
/// <param name="fileName">The file name of the file being extracted; only its extension is used, to select the extractor.</param>
/// <param name="binaryContent">The binary data of the file being extracted.</param>
/// <returns>The string content of the file.</returns>
/// <exception cref="VectorizationException">Thrown when the file extension does not match any supported file type.</exception>
public static async Task<string> ExtractTextFromFileAsync(string fileName, BinaryData binaryContent)
{
    // None of the work below is asynchronous; the await keeps the method async so that
    // failures are reported through the returned task.
    await Task.CompletedTask;

    // Lower-case with the invariant culture so that extension matching does not depend
    // on the current culture of the process.
    var fileExtension = Path.GetExtension(fileName).ToLowerInvariant();

    return fileExtension switch
    {
        FileExtensions.Text => binaryContent.ToString(),
        FileExtensions.Markdown => binaryContent.ToString(),
        FileExtensions.JSON => binaryContent.ToString(),
        FileExtensions.Word => DOCXTextExtractor.GetText(binaryContent),
        FileExtensions.Excel => new XLSXTextExtractor().GetText(binaryContent),
        FileExtensions.PowerPoint => PPTXTextExtractor.GetText(binaryContent),
        FileExtensions.PDF => PDFTextExtractor.GetText(binaryContent),
        _ => throw new VectorizationException($"The file type for {fileName} is not supported."),
    };
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
using FoundationaLLM.Common.Settings;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using FoundationaLLM.Vectorization.Models;
using FoundationaLLM.Vectorization.Models.Configuration;
using FoundationaLLM.Vectorization.Models.Resources;
using FoundationaLLM.Vectorization.ResourceProviders;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand Down Expand Up @@ -43,6 +40,7 @@ public IContentSourceService GetService(string serviceName)
return contentSourceProfile.Type switch
{
ContentSourceType.AzureDataLake => CreateAzureDataLakeContentSourceService(serviceName),
ContentSourceType.SharePointOnline => CreateSharePointOnlineContentSourceService(serviceName),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand All @@ -56,6 +54,7 @@ public IContentSourceService GetService(string serviceName)
return contentSourceProfile.Type switch
{
ContentSourceType.AzureDataLake => (CreateAzureDataLakeContentSourceService(serviceName), contentSourceProfile),
ContentSourceType.SharePointOnline => (CreateSharePointOnlineContentSourceService(serviceName), contentSourceProfile),
_ => throw new VectorizationException($"The content source type {contentSourceProfile.Type} is not supported."),
};
}
Expand All @@ -72,5 +71,17 @@ private DataLakeContentSourceService CreateAzureDataLakeContentSourceService(str
blobStorageServiceSettings,
_loggerFactory);
}

private SharePointOnlineContentSourceService CreateSharePointOnlineContentSourceService(string serviceName)
{
    // Configuration key of the section that belongs to the named content source.
    var configurationKey =
        $"{AppConfigurationKeySections.FoundationaLLM_Vectorization_ContentSources}:{serviceName}";

    // Populate a fresh settings object from that section.
    var settings = new SharePointOnlineContentSourceServiceSettings();
    _configuration.Bind(configurationKey, settings);

    return new SharePointOnlineContentSourceService(settings, _loggerFactory);
}
}
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
using FoundationaLLM.Common.Constants;
using FoundationaLLM.Common.Services;
using FoundationaLLM.Common.Services;
using FoundationaLLM.Common.Settings;
using FoundationaLLM.Vectorization.DataFormats.PDF;
using FoundationaLLM.Vectorization.Exceptions;
using FoundationaLLM.Vectorization.Interfaces;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace FoundationaLLM.Vectorization.Services.ContentSources
{
Expand Down Expand Up @@ -33,7 +29,7 @@ public DataLakeContentSourceService(
}

/// <inheritdoc/>
public async Task<String> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
public async Task<string> ExtractTextFromFileAsync(List<string> multipartId, CancellationToken cancellationToken)
{
ValidateMultipartId(multipartId, 3);

Expand All @@ -42,13 +38,7 @@ public async Task<String> ExtractTextFromFileAsync(List<string> multipartId, Can
multipartId[2],
cancellationToken);

var fileExtension = Path.GetExtension(multipartId[2]);

return fileExtension.ToLower() switch
{
FileExtensions.PDF => PDFTextExtractor.GetText(binaryContent),
_ => throw new VectorizationException($"The file type for {multipartId[2]} is not supported."),
};
return await ExtractTextFromFileAsync(multipartId[2], binaryContent);
}
}
}
Loading
Loading