From 331657b5e1ed30dc4e7273ef2afcc6c825b84fe0 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 15 Oct 2024 17:39:23 -0400 Subject: [PATCH] Update Echo engine to use toolkit Fix bug with pretranslating all; begin porting tests to toolkit Another small logic fix; update tests to reflect not generating pretranslations for segments with target text Fix issue with mapping non-parallel-corpora to parallel corpora Move to service; address scripture alignment issue --- Serval.sln | 15 + .../TranslationEngineServiceV1.cs | 249 +++++------ src/Echo/src/EchoTranslationEngine/Usings.cs | 1 + .../IMachineBuilderExtensions.cs | 6 + .../IServiceCollectionExtensions.cs | 2 +- .../Services/NmtPreprocessBuildJob.cs | 6 +- .../Services/PreprocessBuildJob.cs | 399 ++---------------- .../ServalTranslationEngineServiceV1.cs | 17 +- .../Services/SmtTransferPreprocessBuildJob.cs | 6 +- .../src/Serval.Machine.Shared/Usings.cs | 2 +- .../Services/NmtEngineServiceTests.cs | 8 +- .../Services/PreprocessBuildJobTests.cs | 99 +++-- .../Services/SmtTransferEngineServiceTests.cs | 8 +- .../Serval.Machine.Shared.Tests/Usings.cs | 2 + .../src/Serval.Shared/Serval.Shared.csproj | 2 +- .../Services/EngineService.cs | 12 +- .../Serval.E2ETests/ServalClientHelper.cs | 63 +-- .../IHealthChecksBuilderExtensions.cs | 4 +- .../IServiceCollectionsExtensions.cs | 11 + .../SIL.ServiceToolkit}/Models/CorpusFile.cs | 2 +- .../Models/MonolingualCorpus.cs | 2 +- .../Models/ParallelCorpus.cs | 2 +- .../src/SIL.ServiceToolkit/Models/Row.cs | 3 + .../SIL.ServiceToolkit.csproj | 5 + .../Services/CorpusService.cs | 2 +- .../Services/ICorpusService.cs | 2 +- .../IParallelCorpusPreprocessingService.cs | 11 + .../ParallelCorpusPreprocessingService.cs | 370 ++++++++++++++++ .../src/SIL.ServiceToolkit/Usings.cs | 7 + .../SIL.ServiceToolkit.Tests.csproj | 32 ++ .../Services/ParallelCorpusProcessorTests.cs | 98 +++++ .../Services/data/source1.txt | 7 + .../Services/data/source2.txt | 7 + .../Services/data/target1.txt | 7 + .../test/SIL.ServiceToolkit.Tests/Usings.cs | 2 + 35 files changed, 865 insertions(+), 606 deletions(-) create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs rename src/{Machine/src/Serval.Machine.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Models/CorpusFile.cs (84%) rename src/{Machine/src/Serval.Machine.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Models/MonolingualCorpus.cs (92%) rename src/{Machine/src/Serval.Machine.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Models/ParallelCorpus.cs (87%) create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs rename src/{Machine/src/Serval.Machine.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Services/CorpusService.cs (97%) rename src/{Machine/src/Serval.Machine.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Services/ICorpusService.cs (81%) create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessorTests.cs create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs diff --git a/Serval.sln b/Serval.sln index edd3f075..e094172a 100644 --- a/Serval.sln +++ b/Serval.sln @@ -86,6 +86,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65 EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{32B63C4B-AECD-4499-ADFB-69EF581B4F4C}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{76123A14-29A5-480D-942E-FE00D6474D50}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -180,6 +188,10 @@ Global {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -215,6 +227,9 @@ Global {10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D} {C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98} {0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51} + {76123A14-29A5-480D-942E-FE00D6474D50} = {32B63C4B-AECD-4499-ADFB-69EF581B4F4C} + {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {76123A14-29A5-480D-942E-FE00D6474D50} + {C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370} diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs index 254fe0af..fb7abc66 100644 --- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs @@ -1,10 +1,16 @@ namespace EchoTranslationEngine; -public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase +public class TranslationEngineServiceV1( + BackgroundTaskQueue taskQueue, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) : TranslationEngineApi.TranslationEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; + public override Task Create(CreateRequest request, ServerCallContext context) { if (request.SourceLanguage != request.TargetLanguage) @@ -75,159 +81,34 @@ await client.BuildStartedAsync( try { + List pretranslationsRequests = []; + _parallelCorpusPreprocessingService.Preprocess( + request.Corpora.Select(Map).ToList(), + row => { }, + (row, corpus) => + { + pretranslationsRequests.Add( + new InsertPretranslationsRequest + { + EngineId = request.EngineId, + CorpusId = corpus.Id, + TextId = row.TextId, + Refs = { row.Refs.Select(r => r.ToString()) }, + Translation = row.SourceSegment + } + ); + }, + false + ); using ( AsyncClientStreamingCall call = client.InsertPretranslations(cancellationToken: cancellationToken) ) { - foreach (ParallelCorpus corpus in request.Corpora) + foreach (InsertPretranslationsRequest request in pretranslationsRequests) { - var sourceFiles = corpus - .SourceCorpora.SelectMany(sc => - sc.Files.Where(f => - ( - sc.PretranslateAll - || sc.PretranslateTextIds is null - || sc.PretranslateTextIds.Contains(f.TextId) - ) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - var targetFiles = corpus - .TargetCorpora.SelectMany(tc => - tc.Files.Where(f => - ( - tc.PretranslateAll - || tc.PretranslateTextIds is null - || tc.PretranslateTextIds.Contains(f.TextId) - ) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - - foreach (KeyValuePair sourceFile in sourceFiles) - { - string[] sourceLines = await File.ReadAllLinesAsync( - sourceFile.Value, - cancellationToken - ); - - if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath)) - { - string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken); - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach ( - (string sourceLine, string targetLine) in sourceLines - .Select(l => l.Trim()) - .Zip(targetLines.Select(l => l.Trim())) - ) - { - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - var sourceLinesDict = sourceLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Split('\t')[1].Trim() - ); - var targetLinesDict = targetLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty - ); - foreach (KeyValuePair targetLineKVPair in targetLinesDict) - { - string? sourceLine = null; - sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine); - sourceLine ??= string.Empty; - string? targetLine = targetLineKVPair.Value; - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - } - } - } - else - { - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" }, - Translation = sourceLine.Contains('\t') - ? sourceLine.Split('\t')[1].Trim() - : string.Empty - }, - cancellationToken - ); - } - } - } - } - } + await call.RequestStream.WriteAsync(request, cancellationToken); } - await call.RequestStream.CompleteAsync(); await call; } @@ -325,4 +206,78 @@ ServerCallContext context new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, } ); } + + private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToList(), + TargetCorpora = source.TargetCorpora.Select(Map).ToList() + }; + } + + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + var trainOnChapters = source.TrainOnChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var trainOnTextIds = source.TrainOnTextIds.ToHashSet(); + FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll); + + var pretranslateChapters = source.PretranslateChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var pretranslateTextIds = source.PretranslateTextIds.ToHashSet(); + FilterChoice pretranslateFilter = GetFilterChoice( + pretranslateChapters, + pretranslateTextIds, + source.PretranslateAll + ); + + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null, + TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null, + PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, + PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null + }; + } + + private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = source.Location, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId + }; + } + + private enum FilterChoice + { + Chapters, + TextIds, + None + } + + private static FilterChoice GetFilterChoice( + IReadOnlyDictionary> chapters, + HashSet textIds, + bool noFilter + ) + { + // Only either textIds or Scripture Range will be used at a time + // TextIds may be an empty array, so prefer that if both are empty (which applies to both scripture and text) + if (noFilter || (chapters is null && textIds is null)) + return FilterChoice.None; + if (chapters is null || chapters.Count == 0) + return FilterChoice.TextIds; + return FilterChoice.Chapters; + } } diff --git a/src/Echo/src/EchoTranslationEngine/Usings.cs b/src/Echo/src/EchoTranslationEngine/Usings.cs index b7f3ba2d..0404305b 100644 --- a/src/Echo/src/EchoTranslationEngine/Usings.cs +++ b/src/Echo/src/EchoTranslationEngine/Usings.cs @@ -5,3 +5,4 @@ global using Grpc.Core; global using Microsoft.Extensions.Diagnostics.HealthChecks; global using Serval.Translation.V1; +global using SIL.ServiceToolkit.Utils; diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs index 5a577cb5..c092be36 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs @@ -112,6 +112,12 @@ public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, I return builder; } + public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder) + { + builder.Services.AddParallelCorpusPreprocessor(); + return builder; + } + public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder) { if (builder.Configuration is null) diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs index 9ae176d8..be9fec42 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs @@ -15,7 +15,6 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf services.AddTransient(); services.AddScoped(); - services.AddSingleton(); services.AddStartupTask( (sp, cancellationToken) => sp.GetRequiredService().InitAsync(cancellationToken) @@ -42,6 +41,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf configuration.GetSection(DistributedReaderWriterLockOptions.Key) ); builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key)); + builder.AddServiceToolkitServices(); builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key)); } return builder; diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index 3c46a34e..2e79d09a 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, - ILanguageTagService languageTagService + ILanguageTagService languageTagService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -17,7 +17,7 @@ ILanguageTagService languageTagService logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly ILanguageTagService _languageTagService = languageTagService; diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index ecd52876..a471eb1c 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -1,49 +1,35 @@ namespace Serval.Machine.Shared.Services; -public class PreprocessBuildJob : HangfireBuildJob> +public class PreprocessBuildJob( + IPlatformService platformService, + IRepository engines, + IDataAccessContext dataAccessContext, + ILogger logger, + IBuildJobService buildJobService, + ISharedFileService sharedFileService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) + : HangfireBuildJob>( + platformService, + engines, + dataAccessContext, + buildJobService, + logger + ) { private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true }; internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML; - private readonly ISharedFileService _sharedFileService; - private readonly ICorpusService _corpusService; - private int _seed = 1234; - private Random _random; + private readonly ISharedFileService _sharedFileService = sharedFileService; - public PreprocessBuildJob( - IPlatformService platformService, - IRepository engines, - IDataAccessContext dataAccessContext, - ILogger logger, - IBuildJobService buildJobService, - ISharedFileService sharedFileService, - ICorpusService corpusService - ) - : base(platformService, engines, dataAccessContext, buildJobService, logger) - { - _sharedFileService = sharedFileService; - _corpusService = corpusService; - _random = new Random(_seed); - } - - internal int Seed - { - get => _seed; - set - { - if (_seed != value) - { - _seed = value; - _random = new Random(_seed); - } - } - } + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; protected override async Task DoWorkAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList data, string? buildOptions, CancellationToken cancellationToken ) @@ -121,159 +107,36 @@ CancellationToken cancellationToken int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - foreach (ParallelCorpus corpus in corpora) - { - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus - .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.TrainOnChapters is null - || IsInChapters(sr, sc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - ITextCorpus[] sourcePretranslateCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.PretranslateTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.PretranslateChapters is null - || ( - IsInChapters(sr, sc.Corpus.PretranslateChapters) - && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new()) - ) - ); - }) - .ToArray(); - - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus - .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] targetTrainingCorpora = targetCorpora - .Select(tc => - { - ITextCorpus textCorpus = tc.TextCorpus; - if (tc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || tc.Corpus.TrainOnChapters is null - || IsInChapters(sr, tc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - - if (sourceCorpora.Length == 0) - continue; - - int skipCount = 0; - foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora)) - { - if (skipCount > 0) - { - skipCount--; - continue; - } - - Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray(); - if (trainRows.Length > 0) - { - Row row = trainRows[0]; - if (rows.Length > 1) - { - Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray(); - Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray(); - if (targetNonEmptyRows.Length > 0) - nonEmptyRows = targetNonEmptyRows; - if (nonEmptyRows.Length > 0) - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - row = nonEmptyRows[_random.Next(nonEmptyRows.Length)]; - } - } - } - - await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n"); - await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n"); - skipCount = row.RowCount - 1; - if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) - trainCount++; - } - } - - if ((bool?)buildOptionsObject?["use_key_terms"] ?? true) + _parallelCorpusPreprocessingService.Preprocess( + corpora, + row => { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) - { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); - foreach (ParallelTextRow row in parallelKeyTermsCorpus) - { - await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); - await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); - trainCount++; - } - } - } - void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList refs, string translation) + sourceTrainWriter.Write($"{row.SourceSegment}\n"); + targetTrainWriter.Write($"{row.TargetSegment}\n"); + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + }, + (row, corpus) => { - writer.WriteStartObject(); - writer.WriteString("corpusId", corpus.Id); - writer.WriteString("textId", textId); - writer.WriteStartArray("refs"); - foreach (object rowRef in refs) - writer.WriteStringValue(rowRef.ToString()); - writer.WriteEndArray(); - writer.WriteString("translation", translation); - writer.WriteEndObject(); + pretranslateWriter.WriteStartObject(); + pretranslateWriter.WriteString("corpusId", corpus.Id); + pretranslateWriter.WriteString("textId", row.TextId); + pretranslateWriter.WriteStartArray("refs"); + foreach (object rowRef in row.Refs) + pretranslateWriter.WriteStringValue(rowRef.ToString()); + pretranslateWriter.WriteEndArray(); + pretranslateWriter.WriteString("translation", row.SourceSegment); + pretranslateWriter.WriteEndObject(); pretranslateCount++; - } - - ITextCorpus targetCorpus = - targetCorpora.Length > 0 ? targetCorpora[0].TextCorpus : new DictionaryTextCorpus(); - - foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpus)) - { - if (row.SourceSegment.Length > 0) - WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); - } - } + }, + (bool?)buildOptionsObject?["use_key_terms"] ?? true + ); pretranslateWriter.WriteEndArray(); return (trainCount, pretranslateCount); } - private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) - { - return selection.TryGetValue(sr.Book, out HashSet? chapters) - && chapters != null - && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); - } - protected override async Task CleanupAsync( string engineId, string buildId, @@ -294,189 +157,9 @@ JobCompletionStatus completionStatus } } - private static IEnumerable AlignTrainCorpus( - IReadOnlyList srcCorpora, - IReadOnlyList trgCorpora - ) - { - srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray(); - trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray(); - - if (trgCorpora.All(tc => tc.IsScripture())) - { - return srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc))) - .ZipMany(rows => rows.ToArray()) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - IEnumerable sourceOnlyRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count == 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - IEnumerable targetRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count > 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - return sourceOnlyRows - .Concat(targetRows) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - private static IEnumerable AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - HashSet vrefs = []; - foreach ( - (VerseRef vref, string srcSegment, string trgSegment) in srcCorpus - .ExtractScripture() - .Select(r => (r.CorpusVerseRef, r.Text)) - .Zip( - trgCorpus.ExtractScripture().Select(r => r.Text), - (s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t) - ) - ) - { - if (srcSegment == "" && trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - rowCount++; - } - else if (srcSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (trgSegment.Length > 0) - { - if (trgSegBuffer.Length > 0) - trgSegBuffer.Append(' '); - trgSegBuffer.Append(trgSegment); - } - rowCount++; - } - else if (trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (srcSegment.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(srcSegment); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - vrefs.Clear(); - rowCount = 0; - } - vrefs.UnionWith(vref.AllVerses()); - srcSegBuffer.Append(srcSegment); - trgSegBuffer.Append(trgSegment); - rowCount++; - } - } - - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - } - } - - private static IEnumerable AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - List refs = []; - string textId = ""; - foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true))) - { - if (!row.IsTargetRangeStart && row.IsTargetInRange) - { - refs.AddRange(row.TargetRefs); - if (row.SourceText.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(row.SourceText); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - textId = ""; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - refs.Clear(); - rowCount = 0; - } - - textId = row.TextId; - refs.AddRange(row.TargetRefs); - srcSegBuffer.Append(row.SourceText); - trgSegBuffer.Append(row.TargetText); - rowCount++; - } - } - - if (rowCount > 0) - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - } - - private record Row( - string TextId, - IReadOnlyList Refs, - string SourceSegment, - string TargetSegment, - int RowCount - ); - protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out string resolvedCode) { resolvedCode = languageCode; return true; } - - private static TextRow CleanSegment(TextRow row) - { - if (row.Text == "...") - row.Segment = []; - return row; - } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs index dfc52263..336d98ae 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs @@ -91,7 +91,7 @@ await engineService.TrainSegmentPairAsync( public override async Task StartBuild(StartBuildRequest request, ServerCallContext context) { ITranslationEngineService engineService = GetEngineService(request.EngineType); - Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); + SIL.ServiceToolkit.Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); try { await engineService.StartBuildAsync( @@ -269,9 +269,9 @@ private static Translation.V1.Phrase Map(SIL.Machine.Translation.Phrase source) }; } - private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) + private static SIL.ServiceToolkit.Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) { - return new Models.ParallelCorpus + return new SIL.ServiceToolkit.Models.ParallelCorpus { Id = source.Id, SourceCorpora = source.SourceCorpora.Select(Map).ToList(), @@ -279,7 +279,7 @@ private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) }; } - private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) { var trainOnChapters = source.TrainOnChapters.ToDictionary( kvp => kvp.Key, @@ -299,7 +299,7 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou source.PretranslateAll ); - var corpus = new Models.MonolingualCorpus + return new SIL.ServiceToolkit.Models.MonolingualCorpus { Id = source.Id, Language = source.Language, @@ -309,15 +309,14 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null }; - return corpus; } - private static Models.CorpusFile Map(Translation.V1.CorpusFile source) + private static SIL.ServiceToolkit.Models.CorpusFile Map(Translation.V1.CorpusFile source) { - return new Models.CorpusFile + return new SIL.ServiceToolkit.Models.CorpusFile { Location = source.Location, - Format = (Models.FileFormat)source.Format, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, TextId = source.TextId }; } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index b9393e9b..7e1627a6 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -7,9 +7,9 @@ public class SmtTransferPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, IDistributedReaderWriterLockFactory lockFactory, - IRepository trainSegmentPairs + IRepository trainSegmentPairs, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -18,7 +18,7 @@ IRepository trainSegmentPairs logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly IDistributedReaderWriterLockFactory _lockFactory = lockFactory; diff --git a/src/Machine/src/Serval.Machine.Shared/Usings.cs b/src/Machine/src/Serval.Machine.Shared/Usings.cs index ea49e89d..bb148b80 100644 --- a/src/Machine/src/Serval.Machine.Shared/Usings.cs +++ b/src/Machine/src/Serval.Machine.Shared/Usings.cs @@ -54,7 +54,7 @@ global using SIL.Machine.Translation; global using SIL.Machine.Translation.Thot; global using SIL.Machine.Utils; -global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; global using SIL.ServiceToolkit.Services; global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs index 67145c01..1a694693 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs @@ -1,4 +1,6 @@ -namespace Serval.Machine.Shared.Services; +using SIL.ServiceToolkit.Utils; + +namespace Serval.Machine.Shared.Services; [TestFixture] public class NmtEngineServiceTests @@ -301,8 +303,8 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), - new LanguageTagService() + new LanguageTagService(), + new ParallelCorpusPreprocessingService(new CorpusService()) ); } if (jobType == typeof(PostprocessBuildJob)) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 539b9c4c..27cd0707 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -1,3 +1,5 @@ +using SIL.ServiceToolkit.Utils; + namespace Serval.Machine.Shared.Services; [TestFixture] @@ -65,7 +67,7 @@ public async Task RunAsync_TrainAndPretranslateAll() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); } [Test] @@ -76,7 +78,7 @@ public async Task RunAsync_PretranslateAll() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); } [Test] @@ -87,7 +89,24 @@ public async Task RunAsync_PretranslateTextIds() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); + } + + [Test] + public async Task RunAsync_PretranslateTextIdsOverlapWithTrainOnTextIds() + { + using TestEnvironment env = new(); + ParallelCorpus corpus1 = TestEnvironment.TextFileCorpus( + pretranslateTextIds: ["textId1"], + trainOnTextIds: ["textId1"] + ); + + await env.RunBuildJobAsync(corpus1); + Assert.Multiple(async () => + { + Assert.That((await env.GetTrainCountAsync()).Source1Count, Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(0)); + }); } [Test] @@ -143,7 +162,11 @@ public async Task RunAsync_PretranslateChapters() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(4), + JsonSerializer.Serialize(await env.GetPretranslationsAsync()) + ); } [Test] @@ -189,7 +212,11 @@ public async Task RunAsync_MixedSource_Paratext() Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(56)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(14), + JsonSerializer.Serialize(await env.GetPretranslationsAsync()) + ); } [Test] @@ -208,7 +235,7 @@ public async Task RunAsync_MixedSource_Text() Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(9)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(3)); } [Test] @@ -388,6 +415,13 @@ public async Task ParallelCorpusLogic() new() { } } }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } }, }, TargetCorpora = new List() @@ -434,10 +468,12 @@ public async Task ParallelCorpusLogic() } }; await env.RunBuildJobAsync(corpora, useKeyTerms: false); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.Multiple(async () => { + string src = await env.GetSourceExtractAsync(); Assert.That( - await env.GetSourceExtractAsync(), + src, Is.EqualTo( @"Source one, chapter fourteen, verse fifty-five. Segment b. Source one, chapter fourteen, verse fifty-six. @@ -450,32 +486,35 @@ await env.GetSourceExtractAsync(), Source two, chapter one, verse nine. Source two, chapter one, verse ten. Source two, chapter one, verse one. " - ) + ), + src ); + string trg = await env.GetTargetExtractAsync(); Assert.That( - await env.GetTargetExtractAsync(), + trg, Is.EqualTo( @"Target two, chapter fourteen, verse fifty-five. Target two, chapter fourteen, verse fifty-six. -Target one, chapter one, verse one. -Target one, chapter one, verse two. +Target two, chapter one, verse one. +Target two, chapter one, verse two. Target one, chapter one, verse three. -Target one, chapter one, verse five and six. +Target two, chapter one, verse five and six. Target one, chapter one, verse seven and eight. -Target one, chapter one, verse nine and ten. +Target two, chapter one, verse nine and ten. " - ) + ), + trg + ); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(9), pretranslations.ToJsonString()); + Assert.That( + pretranslations[0]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one."), + pretranslations.ToJsonString() ); }); - JsonArray? pretranslations = await env.GetPretranslationsAsync(); - Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations!.Count, Is.EqualTo(37), pretranslations.ToJsonString()); - Assert.That( - pretranslations[2]!["translation"]!.ToString(), - Is.EqualTo("Source one, chapter twelve, verse one.") - ); } private class TestEnvironment : DisposableBase @@ -781,12 +820,9 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, - new LanguageTagService() - ) - { - Seed = 1234 - }; + new LanguageTagService(), + new ParallelCorpusPreprocessingService(CorpusService) + ); } case TranslationEngineType.SmtTransfer: { @@ -797,13 +833,10 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, LockFactory, - TrainSegmentPairs - ) - { - Seed = 1234 - }; + TrainSegmentPairs, + new ParallelCorpusPreprocessingService(CorpusService) + ); } default: throw new InvalidOperationException("Unknown engine type."); diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs index 6b888794..b1d989f0 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs @@ -1,4 +1,6 @@ -namespace Serval.Machine.Shared.Services; +using SIL.ServiceToolkit.Utils; + +namespace Serval.Machine.Shared.Services; [TestFixture] public class SmtTransferEngineServiceTests @@ -687,9 +689,9 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), _env._lockFactory, - _env.TrainSegmentPairs + _env.TrainSegmentPairs, + new ParallelCorpusPreprocessingService(new CorpusService()) ) { TrainJobRunnerType = _env._trainJobRunnerType diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs index f58cb973..3ccb5537 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs @@ -28,4 +28,6 @@ global using SIL.Machine.Utils; global using SIL.ObjectModel; global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; global using SIL.WritingSystems; diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 5af835f5..af12b208 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 5b3d08ff..9f16d939 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -681,12 +681,12 @@ pretranslateCorpus is null ); } } - return new V1.ParallelCorpus - { - Id = source.Id, - SourceCorpora = { sourceCorpus }, - TargetCorpora = { targetCorpus } - }; + V1.ParallelCorpus corpus = new() { Id = source.Id }; + if (sourceCorpus.Files.Count > 0) + corpus.SourceCorpora.Add(sourceCorpus); + if (targetCorpus.Files.Count > 0) + corpus.TargetCorpora.Add(targetCorpus); + return corpus; } private V1.ParallelCorpus Map( diff --git a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs index d64fb15a..95c322da 100644 --- a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs +++ b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs @@ -177,12 +177,22 @@ public async Task AddTextCorpusToEngineAsync( bool pretranslate ) { - List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); var targetFileConfig = new List(); if (!pretranslate) { - List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); foreach (var item in targetFiles.Select((file, i) => new { i, file })) { targetFileConfig.Add( @@ -193,20 +203,11 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) - { - // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) - // if pretranslating, we need to upload the source separately - // if different languages, we are not echoing. - } - else + for (int i = 0; i < sourceFiles.Count; i++) { - for (int i = 0; i < sourceFiles.Count; i++) - { - sourceFileConfig.Add( - new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } - ); - } + sourceFileConfig.Add( + new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } + ); } TranslationCorpus response = await TranslationEnginesClient.AddCorpusAsync( @@ -239,12 +240,22 @@ public async Task AddParallelTextCorpusToEngineAsync( bool pretranslate ) { - List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); var targetFileConfig = new List(); if (!pretranslate) { - List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); foreach (var item in targetFiles.Select((file, i) => new { i, file })) { targetFileConfig.Add(new CorpusFileConfig { FileId = item.file.Id, TextId = filesToAdd[item.i] }); @@ -263,18 +274,9 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) - { - // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) - // if pretranslating, we need to upload the source separately - // if different languages, we are not echoing. - } - else + for (int i = 0; i < sourceFiles.Count; i++) { - for (int i = 0; i < sourceFiles.Count; i++) - { - sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); - } + sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); } CorpusConfig sourceCorpusConfig = @@ -305,7 +307,8 @@ bool pretranslate public async Task> UploadFilesAsync( IEnumerable filesToAdd, FileFormat fileFormat, - string language + string language, + bool isTarget ) { string languageFolder = Path.GetFullPath( @@ -325,7 +328,7 @@ string language foreach (string fileName in filesToAdd) { - string fullName = _prefix + language + "_" + fileName; + string fullName = _prefix + language + "_" + fileName + (isTarget ? "_trg" : "_src"); //delete files that have the name name if (filenameToId.Contains(fullName)) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs index 83fd6a21..14e4ba2a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs @@ -1,6 +1,4 @@ -using SIL.ServiceToolkit.Services; - -namespace Microsoft.Extensions.DependencyInjection; +namespace Microsoft.Extensions.DependencyInjection; public static class IHealthChecksBuilderExtensions { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs new file mode 100644 index 00000000..d5a6424f --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -0,0 +1,11 @@ +namespace Microsoft.Extensions.DependencyInjection; + +public static class IServiceCollectionExtensions +{ + public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) + { + services.AddSingleton(); + services.AddSingleton(); + return services; + } +} diff --git a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs similarity index 84% rename from src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs index a84bf7f6..65e45202 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public enum FileFormat { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs similarity index 92% rename from src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs index 2b4a1612..c0323727 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record MonolingualCorpus { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs similarity index 87% rename from src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs index a28dfc14..83374162 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record ParallelCorpus { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs new file mode 100644 index 00000000..5b43e1fe --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs @@ -0,0 +1,3 @@ +namespace SIL.ServiceToolkit.Models; + +public record Row(string TextId, IReadOnlyList Refs, string SourceSegment, string TargetSegment, int RowCount); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index a84edf58..509c7683 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -16,6 +16,11 @@ + + + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs similarity index 97% rename from src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 17d562ad..71d49a50 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class CorpusService : ICorpusService { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs similarity index 81% rename from src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index bbcc9de3..babe8c9b 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Services; +namespace SIL.ServiceToolkit.Services; public interface ICorpusService { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..1556de6d --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs @@ -0,0 +1,11 @@ +namespace SIL.ServiceToolkit.Utils; + +public interface IParallelCorpusPreprocessingService +{ + void Preprocess( + IReadOnlyList corpora, + Action train, + Action pretranslate, + bool useKeyTerms = false + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..df13a069 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -0,0 +1,370 @@ +namespace SIL.ServiceToolkit.Utils; + +public class ParallelCorpusPreprocessingService : IParallelCorpusPreprocessingService +{ + private readonly ICorpusService _corpusService; + private int _seed = 1234; + private Random _random; + + public ParallelCorpusPreprocessingService(ICorpusService corpusService) + { + _corpusService = corpusService; + _random = new Random(_seed); + } + + internal int Seed + { + get => _seed; + set + { + if (_seed != value) + { + _seed = value; + _random = new Random(_seed); + } + } + } + + public void Preprocess( + IReadOnlyList corpora, + Action train, + Action pretranslate, + bool useKeyTerms = false + ) + { + foreach (ParallelCorpus corpus in corpora) + { + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus + .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + ITextCorpus[] sourceTrainingCorpora = sourceCorpora + .Select(sc => + { + ITextCorpus textCorpus = sc.TextCorpus; + if (sc.Corpus.TrainOnTextIds is not null) + return textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds); + return textCorpus.Where(row => + row.Ref is not ScriptureRef sr + || sc.Corpus.TrainOnChapters is null + || IsInChapters(sr, sc.Corpus.TrainOnChapters) + ); + }) + .ToArray(); + ITextCorpus[] sourcePretranslateCorpora = sourceCorpora + .Select(sc => + { + ITextCorpus textCorpus = sc.TextCorpus; + if (sc.Corpus.PretranslateTextIds is not null) + { + return textCorpus.FilterTexts( + sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new()) + ); + } + return textCorpus.Where(row => + row.Ref is not ScriptureRef sr + || sc.Corpus.PretranslateChapters is null + || ( + IsInChapters(sr, sc.Corpus.PretranslateChapters) + && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new()) + ) + ); + }) + .ToArray(); + + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus + .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + ITextCorpus[] targetTrainingCorpora = targetCorpora + .Select(tc => + { + ITextCorpus textCorpus = tc.TextCorpus; + if (tc.Corpus.TrainOnTextIds is not null) + return textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds); + return textCorpus.Where(row => + row.Ref is not ScriptureRef sr + || tc.Corpus.TrainOnChapters is null + || IsInChapters(sr, tc.Corpus.TrainOnChapters) + ); + }) + .ToArray(); + + if (sourceCorpora.Length == 0) + continue; + + int skipCount = 0; + foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora)) + { + if (skipCount > 0) + { + skipCount--; + continue; + } + + Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray(); + if (trainRows.Length > 0) + { + Row row = trainRows[0]; + if (rows.Length > 1) + { + Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray(); + Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray(); + if (targetNonEmptyRows.Length > 0) + nonEmptyRows = targetNonEmptyRows; + if (nonEmptyRows.Length > 0) + { + row = nonEmptyRows[_random.Next(nonEmptyRows.Length)]; + } + } + skipCount = row.RowCount - 1; + train(row); + } + } + + if (useKeyTerms) + { + ITextCorpus? sourceTermCorpus = _corpusService + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) + .FirstOrDefault(); + ITextCorpus? targetTermCorpus = _corpusService + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) + .FirstOrDefault(); + if (sourceTermCorpus is not null && targetTermCorpus is not null) + { + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + foreach (ParallelTextRow row in parallelKeyTermsCorpus) + { + train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); + } + } + } + + foreach ( + Row row in AlignPretranslateCorpus( + sourcePretranslateCorpora, + targetCorpora.Length > 0 + ? targetCorpora.Select(tc => tc.TextCorpus).ToArray() + : [new DictionaryTextCorpus()] + ) + ) + { + pretranslate(row, corpus); + } + } + } + + private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) + { + return selection.TryGetValue(sr.Book, out HashSet? chapters) + && chapters != null + && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); + } + + private static IEnumerable AlignTrainCorpus( + IReadOnlyList srcCorpora, + IReadOnlyList trgCorpora + ) + { + srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray(); + trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray(); + + if (trgCorpora.All(tc => tc.IsScripture())) + { + return srcCorpora + .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc))) + .ZipMany(rows => rows.ToArray()) + // filter out every list that only contains completely empty rows + .Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); + } + + IEnumerable sourceOnlyRows = srcCorpora + .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true))) + .ZipMany(rows => + rows.Where(r => r.TargetSegment.Count == 0) + .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) + .ToArray() + ); + + IEnumerable targetRows = srcCorpora + .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true))) + .ZipMany(rows => + rows.Where(r => r.TargetSegment.Count > 0) + .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) + .ToArray() + ); + + return sourceOnlyRows + .Concat(targetRows) + // filter out every list that only contains completely empty rows + .Where(rows => rows.Any(r => r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); + } + + private static IEnumerable AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus) + { + int rowCount = 0; + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + HashSet vrefs = []; + foreach ( + (VerseRef vref, string srcSegment, string trgSegment) in srcCorpus + .ExtractScripture() + .Select(r => (r.CorpusVerseRef, r.Text)) + .Zip( + trgCorpus.ExtractScripture().Select(r => r.Text), + (s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t) + ) + ) + { + if (srcSegment == "" && trgSegment == "") + { + vrefs.UnionWith(vref.AllVerses()); + rowCount++; + } + else if (srcSegment == "") + { + vrefs.UnionWith(vref.AllVerses()); + if (trgSegment.Length > 0) + { + if (trgSegBuffer.Length > 0) + trgSegBuffer.Append(' '); + trgSegBuffer.Append(trgSegment); + } + rowCount++; + } + else if (trgSegment == "") + { + vrefs.UnionWith(vref.AllVerses()); + if (srcSegment.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(srcSegment); + } + rowCount++; + } + else + { + if (rowCount > 0) + { + yield return new( + vrefs.First().Book, + vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), + srcSegBuffer.ToString(), + trgSegBuffer.ToString(), + rowCount + ); + for (int i = 0; i < rowCount - 1; i++) + yield return null; + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + vrefs.Clear(); + rowCount = 0; + } + vrefs.UnionWith(vref.AllVerses()); + srcSegBuffer.Append(srcSegment); + trgSegBuffer.Append(trgSegment); + rowCount++; + } + } + + if (rowCount > 0) + { + yield return new( + vrefs.First().Book, + vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), + srcSegBuffer.ToString(), + trgSegBuffer.ToString(), + rowCount + ); + for (int i = 0; i < rowCount - 1; i++) + yield return null; + } + } + + private IEnumerable AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus[] trgCorpora) + { + if (srcCorpora.All(sc => sc.IsScripture())) + { + int skipCount = 0; + var corpora = srcCorpora + .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc))) + .ZipMany(rows => rows.ToArray()) + .Where(rows => rows.All(r => r is null || r.TargetSegment.Length == 0)) + .Select(rows => rows.Where(r => r is not null && r.SourceSegment.Length > 0).Select(r => r!).ToArray()) + .Where(rows => rows.Length > 0); + foreach (Row[] rows in corpora) + { + if (skipCount > 0) + { + skipCount--; + continue; + } + Row row = rows.First(); + if (rows.Length > 1) + { + row = rows[_random.Next(rows.Length)]; + } + if (rows.Select(r => r.Refs.Count).Distinct().Count() > 1) + skipCount = row.RowCount - 1; + yield return row; + } + } + else + { + int rowCount = 0; + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List refs = []; + string textId = ""; + foreach ( + ParallelTextRow row in srcCorpora + .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true))) + .ZipMany(rows => rows.ToArray()) + .Where(rows => rows.All(r => r.TargetSegment.Count == 0)) + .Select(rows => rows.Where(r => r.SourceSegment.Count > 0).FirstOrDefault()) + .Where(r => r is not null) + .Select(r => r!) + ) + { + if (!row.IsTargetRangeStart && row.IsTargetInRange) + { + refs.AddRange(row.TargetRefs); + if (row.SourceText.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.SourceText); + } + rowCount++; + } + else + { + if (rowCount > 0) + { + yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + textId = ""; + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + rowCount = 0; + } + + textId = row.TextId; + refs.AddRange(row.TargetRefs); + srcSegBuffer.Append(row.SourceText); + trgSegBuffer.Append(row.TargetText); + rowCount++; + } + } + + if (rowCount > 0) + yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + } + } + + private static TextRow CleanSegment(TextRow row) + { + if (row.Text == "...") + row.Segment = []; + return row; + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs index 0d9630d6..bcf193f4 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs @@ -1,4 +1,5 @@ global using System.Diagnostics.CodeAnalysis; +global using System.Text; global using System.Text.Json.Nodes; global using System.Text.RegularExpressions; global using Grpc.Core; @@ -9,4 +10,10 @@ global using Microsoft.Extensions.Hosting; global using Microsoft.Extensions.Logging; global using Microsoft.Extensions.Options; +global using SIL.Machine.Corpora; +global using SIL.Machine.Utils; +global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; +global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj new file mode 100644 index 00000000..4826338e --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj @@ -0,0 +1,32 @@ + + + + net8.0 + enable + enable + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessorTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessorTests.cs new file mode 100644 index 00000000..75c19878 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessorTests.cs @@ -0,0 +1,98 @@ +using SIL.ServiceToolkit.Services; + +namespace SIL.ServiceToolkit.Utils; + +[TestFixture] +public class ParallelCorpusPreprocessorTests +{ + private static readonly string TestDataPath = Path.Combine( + AppContext.BaseDirectory, + "..", + "..", + "..", + "Utils", + "data" + ); + + [Test] + public void TestParallelCorpusPreprocessor() + { + var processor = new ParallelCorpusPreprocessingService(new CorpusService()); + List corpora = + new() + { + new() + { + Id = "corpus1", + SourceCorpora = new List + { + new MonolingualCorpus() + { + Id = "source-corpus1", + Language = "en", + Files = new List + { + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source1.txt") + } + } + }, + new MonolingualCorpus() + { + Id = "source-corpus2", + Language = "en", + Files = new List + { + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source2.txt") + } + } + } + }, + TargetCorpora = new List + { + new MonolingualCorpus() + { + Id = "target-corpus1", + Language = "en", + Files = new List + { + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "target1.txt") + } + } + } + } + } + }; + int trainCount = 0; + int pretranslateCount = 0; + processor.Preprocess( + corpora, + row => + { + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + }, + (row, corpus) => + { + pretranslateCount++; + }, + false + ); + Assert.Multiple(() => + { + Assert.That(trainCount, Is.EqualTo(2)); + Assert.That(pretranslateCount, Is.EqualTo(3)); + }); + } +} diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt new file mode 100644 index 00000000..2aeb971c --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt @@ -0,0 +1,7 @@ +Source one, Line 1 +Source one, Line 2 + +Source one, Line 4 + +Source one, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt new file mode 100644 index 00000000..7f4a0669 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt @@ -0,0 +1,7 @@ +Source two, Line 1 +Source two, Line 2 + +Source two, Line 4 +Source two, Line 5 +Source two, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt new file mode 100644 index 00000000..816e9435 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt @@ -0,0 +1,7 @@ +Target one, Line 1 + + +Target one, Line 4 + + +Target one, Line 7 diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs new file mode 100644 index 00000000..e1c24c5f --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs @@ -0,0 +1,2 @@ +global using NUnit.Framework; +global using SIL.ServiceToolkit.Models;