From 46b128eb8a041b645dea77eabd3db896cd8b3822 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 31 Oct 2024 10:37:44 -0400 Subject: [PATCH 1/3] Fix up USFM pretranslations for Parallel corpus --- .../Services/PretranslationService.cs | 21 ++- .../test/Serval.E2ETests/ServalApiTests.cs | 8 +- .../Services/PretranslationServiceTests.cs | 170 +++++++++++++----- 3 files changed, 153 insertions(+), 46 deletions(-) diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 48e89b91..1bf552fb 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -41,11 +41,24 @@ public async Task GetUsfmAsync( { Engine? engine = await _engines.GetAsync(engineId, cancellationToken); Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId); - if (corpus is null) - throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'."); + ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); - CorpusFile sourceFile = corpus.SourceFiles[0]; - CorpusFile targetFile = corpus.TargetFiles[0]; + CorpusFile sourceFile; + CorpusFile targetFile; + if (corpus is not null) + { + sourceFile = corpus.SourceFiles[0]; + targetFile = corpus.TargetFiles[0]; + } + else if (parallelCorpus is not null) + { + sourceFile = parallelCorpus.SourceCorpora[0].Files[0]; + targetFile = parallelCorpus.TargetCorpora[0].Files[0]; + } + else + { + throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'."); + } if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext) throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora."); diff --git a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs index 3e31be71..d4899775 100644 --- a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs +++ b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs @@ -221,7 +221,7 @@ public async Task NmtLargeBatchAndDownload() engineId, cId ); - TestContext.WriteLine(lTrans[0].Translation); + Assert.That(lTrans, Has.Count.EqualTo(14)); // Download the model from the s3 bucket ModelDownloadUrl url = await _helperClient.TranslationEnginesClient.GetModelDownloadUrlAsync(engineId); using Task s = new HttpClient().GetStreamAsync(url.Url); @@ -436,6 +436,12 @@ public async Task ParatextProjectNmtJobAsync() corpus.Id ); Assert.That(lTrans, Is.Not.Empty); + string usfm = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync( + engineId, + corpus.Id, + "JHN" + ); + Assert.That(usfm, Does.Contain("\\v 1")); } [TearDown] diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index cbdcb6ff..5aca4ed6 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -22,7 +22,7 @@ public class PretranslationServiceTests [Test] public async Task GetUsfmAsync_Source_PreferExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, @@ -46,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() [Test] public async Task GetUsfmAsync_Source_PreferPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -70,7 +70,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() [Test] public async Task GetUsfmAsync_Source_OnlyExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyExisting, @@ -94,7 +94,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting() [Test] public async Task GetUsfmAsync_Source_OnlyPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyPretranslated, @@ -118,7 +118,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() [Test] public async Task GetUsfmAsync_Target_PreferExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -143,7 +143,7 @@ public async Task GetUsfmAsync_Target_PreferExisting() [Test] public async Task GetUsfmAsync_Target_PreferPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -168,7 +168,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated() [Test] public async Task GetUsfmAsync_Target_TargetBookDoesNotExist() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -181,7 +181,7 @@ public async Task GetUsfmAsync_Target_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -205,7 +205,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookExists() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -230,7 +230,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() [Test] public async Task GetUsfmAsync_Target_OnlyExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -244,7 +244,7 @@ public async Task GetUsfmAsync_Target_OnlyExisting() [Test] public async Task GetUsfmAsync_Target_OnlyPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -266,10 +266,26 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated() ); } - private class TestEnvironment + private class TestEnvironment : IDisposable { public TestEnvironment() { + CorpusFile file1 = + new() + { + Id = "file1", + Filename = "file1.zip", + Format = Shared.Contracts.FileFormat.Paratext, + TextId = "project1" + }; + CorpusFile file2 = + new() + { + Id = "file2", + Filename = "file2.zip", + Format = Shared.Contracts.FileFormat.Paratext, + TextId = "project1" + }; Engines = new MemoryRepository( [ new() @@ -287,29 +303,45 @@ public TestEnvironment() Id = "corpus1", SourceLanguage = "en", TargetLanguage = "en", - SourceFiles = - [ + SourceFiles = [file1], + TargetFiles = [file2], + } + ] + }, + new() + { + Id = "parallel_engine1", + Owner = "owner1", + SourceLanguage = "en", + TargetLanguage = "en", + Type = "nmt", + ModelRevision = 1, + ParallelCorpora = + [ + new() + { + Id = "parallel_corpus1", + SourceCorpora = new List() + { new() { - Id = "file1", - Filename = "file1.zip", - Format = Shared.Contracts.FileFormat.Paratext, - TextId = "project1" + Id = "src_1", + Language = "en", + Files = [file1], } - ], - TargetFiles = - [ + }, + TargetCorpora = new List() + { new() { - Id = "file2", - Filename = "file2.zip", - Format = Shared.Contracts.FileFormat.Paratext, - TextId = "project1" + Id = "trg_1", + Language = "es", + Files = [file2], } - ], + } } ] - } + }, ] ); @@ -334,6 +366,26 @@ public TestEnvironment() TextId = "MAT", Refs = ["MAT 1:2"], Translation = "Chapter 1, verse 2." + }, + new() + { + Id = "pt3", + EngineRef = "parallel_engine1", + ModelRevision = 1, + CorpusRef = "parallel_corpus1", + TextId = "MAT", + Refs = ["MAT 1:1"], + Translation = "Chapter 1, verse 1." + }, + new() + { + Id = "pt4", + EngineRef = "parallel_engine1", + ModelRevision = 1, + CorpusRef = "parallel_corpus1", + TextId = "MAT", + Refs = ["MAT 1:2"], + Translation = "Chapter 1, verse 2." } ] ); @@ -342,23 +394,37 @@ public TestEnvironment() ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG")); var zipSubstituteSource = Substitute.For(); var zipSubstituteTarget = Substitute.For(); - zipSubstituteSource.OpenEntry("MATSRC.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); - zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(""))); + zipSubstituteSource + .OpenEntry("MATSRC.SFM") + .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); + zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(""))); zipSubstituteSource.EntryExists(Arg.Any()).Returns(false); zipSubstituteTarget.EntryExists(Arg.Any()).Returns(false); zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true); zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true); TargetZipContainer = zipSubstituteTarget; - using var textUpdaterSource = new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteSource, - CreateProjectSettings("SRC") - ); - using var textUpdaterTarget = new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteTarget, - CreateProjectSettings("TRG") - ); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(textUpdaterSource); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(textUpdaterTarget); + TextUpdaters = new List(); + Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type) + { + var updater = type switch + { + "SRC" + => new Shared.Services.ZipParatextProjectTextUpdater( + zipSubstituteSource, + CreateProjectSettings("SRC") + ), + "TRG" + => new Shared.Services.ZipParatextProjectTextUpdater( + zipSubstituteTarget, + CreateProjectSettings("TRG") + ), + _ => throw new ArgumentException() + }; + TextUpdaters.Add(updater); + return updater; + } + ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(x => GetTextUpdater("SRC")); + ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(x => GetTextUpdater("TRG")); Service = new PretranslationService(Pretranslations, Engines, ScriptureDataFileService); } @@ -367,6 +433,7 @@ public TestEnvironment() public MemoryRepository Engines { get; } public IScriptureDataFileService ScriptureDataFileService { get; } public IZipContainer TargetZipContainer { get; } + public IList TextUpdaters { get; } public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, @@ -381,12 +448,25 @@ PretranslationUsfmTemplate template textOrigin: textOrigin, template: template ); - return usfm.Replace("\r\n", "\n"); + usfm = usfm.Replace("\r\n", "\n"); + string parallel_usfm = await Service.GetUsfmAsync( + engineId: "parallel_engine1", + modelRevision: 1, + corpusId: "parallel_corpus1", + textId: "MAT", + textOrigin: textOrigin, + template: template + ); + parallel_usfm = parallel_usfm.Replace("\r\n", "\n"); + Assert.That(parallel_usfm, Is.EqualTo(usfm)); + return usfm; } public void AddMatthewToTarget() { - TargetZipContainer.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); + TargetZipContainer + .OpenEntry("MATTRG.SFM") + .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); } private static ParatextProjectSettings CreateProjectSettings(string name) @@ -406,5 +486,13 @@ private static ParatextProjectSettings CreateProjectSettings(string name) languageCode: "en" ); } + + public void Dispose() + { + foreach (var updater in TextUpdaters) + { + updater.Dispose(); + } + } } } From fa6a04d4d7ecf2782432ea92672ac04223ad00fd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 31 Oct 2024 15:39:00 -0400 Subject: [PATCH 2/3] Make 'use first source' consistent across preprocessing & add check --- .../Services/PreprocessBuildJob.cs | 29 ++++-- .../Services/PreprocessBuildJobTests.cs | 14 ++- .../TranslationEnginesController.cs | 18 ++++ .../TranslationEngineTests.cs | 88 ++++++++++++++----- 4 files changed, 114 insertions(+), 35 deletions(-) diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 7c5e9575..082cdeff 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -139,12 +139,16 @@ row.Ref is not ScriptureRef sr ); }) .ToArray(); - ITextCorpus[] sourcePretranslateCorpora = sourceCorpora + ITextCorpus? sourcePretranslateCorpus = sourceCorpora .Select(sc => { ITextCorpus textCorpus = sc.TextCorpus; if (sc.Corpus.PretranslateTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds); + { + textCorpus = textCorpus.FilterTexts( + sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new()) + ); + } return textCorpus.Where(row => row.Ref is not ScriptureRef sr || sc.Corpus.PretranslateChapters is null @@ -154,7 +158,8 @@ row.Ref is not ScriptureRef sr ) ); }) - .ToArray(); + .ToArray() + .FirstOrDefault(); (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) @@ -254,11 +259,13 @@ void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList refs, ITextCorpus targetCorpus = targetCorpora.Length > 0 ? targetCorpora[0].TextCorpus : new DictionaryTextCorpus(); - - foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpus)) + if (sourcePretranslateCorpus != null) { - if (row.SourceSegment.Length > 0) - WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); + foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpus, targetCorpus)) + { + if (row.SourceSegment.Length > 0 && (row.TargetSegment.Length == 0 || !targetCorpus.Any())) + WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); + } } } @@ -415,14 +422,18 @@ IReadOnlyList trgCorpora } } - private static IEnumerable AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus) + private static IEnumerable AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus) { int rowCount = 0; StringBuilder srcSegBuffer = new(); StringBuilder trgSegBuffer = new(); List refs = []; string textId = ""; - foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true))) + + srcCorpus = srcCorpus.Transform(CleanSegment); + trgCorpus = trgCorpus.Transform(CleanSegment); + + foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true)) { if (!row.IsTargetRangeStart && row.IsTargetInRange) { diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index a4d8eef1..da976acf 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -191,7 +191,11 @@ public async Task RunAsync_MixedSource_Paratext() Assert.That(termCount, Is.EqualTo(0)); }); // FIXME - this should be 56 (or double check) - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(30)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(13), + (await env.GetPretranslationsAsync())?.ToJsonString() + ); } [Test] @@ -211,7 +215,11 @@ public async Task RunAsync_MixedSource_Text() Assert.That(termCount, Is.EqualTo(0)); }); // FIXME this should be 9. - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(5)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(2), + (await env.GetPretranslationsAsync())?.ToJsonString() + ); } [Test] @@ -475,7 +483,7 @@ await env.GetTargetExtractAsync(), JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.That(pretranslations, Is.Not.Null); // FIXME this should be 37. - Assert.That(pretranslations!.Count, Is.EqualTo(24), pretranslations.ToJsonString()); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); Assert.That( pretranslations[2]!["translation"]!.ToString(), Is.EqualTo("Source one, chapter twelve, verse one.") diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index 4871b06b..8fb394ae 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -1372,6 +1372,24 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source) $"The parallel corpus {pcc.ParallelCorpusId} is not valid: This parallel corpus does not exist for engine {engine.Id}." ); } + if ( + pcc.SourceFilters != null + && pcc.SourceFilters.Count > 0 + && ( + pcc.SourceFilters.Select(sf => sf.CorpusId).Distinct().Count() > 1 + || pcc.SourceFilters[0].CorpusId + != engine + .ParallelCorpora.Where(pc => pc.Id == pcc.ParallelCorpusId) + .First() + .SourceCorpora[0] + .Id + ) + ) + { + throw new InvalidOperationException( + $"Only the first source corpus in a parallel corpus may be filtered for pretranslation." + ); + } pretranslateCorpora.Add( new PretranslateCorpus { diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs index cdf1bcf3..d5bb79f3 100644 --- a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs @@ -28,7 +28,15 @@ public class TranslationEngineTests new() { Name = "TestCorpus", - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], + TargetCorpusIds = [TARGET_CORPUS_ID], + }; + + private static readonly TranslationParallelCorpusConfig TestMixedParallelCorpusConfig = + new() + { + Name = "TestCorpus", + SourceCorpusIds = [SOURCE_CORPUS_ID_1, SOURCE_CORPUS_ID_2], TargetCorpusIds = [TARGET_CORPUS_ID], }; private static readonly TranslationCorpusConfig TestCorpusConfigNonEcho = @@ -70,8 +78,9 @@ public class TranslationEngineTests private const string FILE3_FILENAME = "file_c"; private const string FILE4_ID = "f00000000000000000000004"; private const string FILE4_FILENAME = "file_d"; - private const string SOURCE_CORPUS_ID = "cc0000000000000000000001"; - private const string TARGET_CORPUS_ID = "cc0000000000000000000002"; + private const string SOURCE_CORPUS_ID_1 = "cc0000000000000000000001"; + private const string SOURCE_CORPUS_ID_2 = "cc0000000000000000000002"; + private const string TARGET_CORPUS_ID = "cc0000000000000000000003"; private const string DOES_NOT_EXIST_ENGINE_ID = "e00000000000000000000004"; private const string DOES_NOT_EXIST_CORPUS_ID = "c00000000000000000000001"; @@ -170,7 +179,14 @@ public async Task SetUp() var srcCorpus = new DataFiles.Models.Corpus { - Id = SOURCE_CORPUS_ID, + Id = SOURCE_CORPUS_ID_1, + Language = "en", + Owner = "client1", + Files = [new() { File = srcFile, TextId = "all" }] + }; + var srcCorpus2 = new DataFiles.Models.Corpus + { + Id = SOURCE_CORPUS_ID_2, Language = "en", Owner = "client1", Files = [new() { File = srcFile, TextId = "all" }] @@ -182,7 +198,7 @@ public async Task SetUp() Owner = "client1", Files = [new() { File = trgFile, TextId = "all" }] }; - await _env.Corpora.InsertAllAsync([srcCorpus, trgCorpus]); + await _env.Corpora.InsertAllAsync([srcCorpus, srcCorpus2, trgCorpus]); } [Test] @@ -813,7 +829,7 @@ public async Task AddParallelCorpusToEngineByIdAsync() ); Assert.Multiple(() => { - Assert.That(result.SourceCorpora.First().Id, Is.EqualTo(SOURCE_CORPUS_ID)); + Assert.That(result.SourceCorpora.First().Id, Is.EqualTo(SOURCE_CORPUS_ID_1)); Assert.That(result.TargetCorpora.First().Id, Is.EqualTo(TARGET_CORPUS_ID)); }); Engine? engine = await _env.Engines.GetAsync(ECHO_ENGINE1_ID); @@ -861,7 +877,7 @@ public async Task UpdateParallelCorpusByIdForEngineByIdAsync() ); var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, result.Id, updateConfig); @@ -883,7 +899,7 @@ public void UpdateParallelCorpusByIdForEngineById_NoSuchCorpus() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, DOES_NOT_EXIST_CORPUS_ID, updateConfig); @@ -900,10 +916,10 @@ public void UpdateParallelCorpusByIdForEngineById_NoSuchEngine() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; - await client.UpdateParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID, updateConfig); + await client.UpdateParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID_1, updateConfig); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); } @@ -917,7 +933,7 @@ public void UpdateParallelCorpusByIdForEngineById_NotAuthorized() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, DOES_NOT_EXIST_CORPUS_ID, updateConfig); @@ -1010,7 +1026,7 @@ public void GetParallelCorpusByIdForEngineById_NoSuchEngine() { TranslationParallelCorpus result_afterAdd = await client.GetParallelCorpusAsync( DOES_NOT_EXIST_ENGINE_ID, - SOURCE_CORPUS_ID + SOURCE_CORPUS_ID_1 ); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); @@ -1085,7 +1101,7 @@ public void DeleteParallelCorpusByIdForEngineById_NoSuchEngine() ServalApiException? ex = Assert.ThrowsAsync(async () => { - await client.DeleteParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID); + await client.DeleteParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID_1); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); } @@ -1097,7 +1113,7 @@ public void DeleteParallelCorpusByIdForEngineById_NotAuthorized() ServalApiException? ex = Assert.ThrowsAsync(async () => { - await client.DeleteParallelCorpusAsync(ECHO_ENGINE1_ID, SOURCE_CORPUS_ID); + await client.DeleteParallelCorpusAsync(ECHO_ENGINE1_ID, SOURCE_CORPUS_ID_1); }); Assert.That(ex?.StatusCode, Is.EqualTo(403)); } @@ -1578,13 +1594,13 @@ public async Task StartBuild_ParallelCorpus() new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }] + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }] }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }], + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID, TextIds = ["all"] }] }; ; @@ -1625,13 +1641,13 @@ public async Task StartBuildAsync_ParallelCorpus() new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }] + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }] }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }], + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID, TextIds = ["all"] }] }; ; @@ -1666,12 +1682,12 @@ public async Task StartBuildAsync_Corpus_NoFilter() TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); TranslationCorpus addedCorpus = await client.AddCorpusAsync(NMT_ENGINE1_ID, TestCorpusConfig); PretranslateCorpusConfig ptcc = - new() { CorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID }] }; + new() { CorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }] }; TrainingCorpusConfig tcc = new() { CorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID }], + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID }] }; ; @@ -1717,12 +1733,12 @@ public async Task StartBuildAsync_ParallelCorpus_NoFilter() TestParallelCorpusConfig ); PretranslateCorpusConfig ptcc = - new() { ParallelCorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID }] }; + new() { ParallelCorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }] }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID }], + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID }] }; ; @@ -1803,7 +1819,7 @@ public async Task StartBuildAsync_ParallelCorpus_PretranslateNoCorpusSpecified() TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); TranslationParallelCorpus addedParallelCorpus = await client.AddParallelCorpusAsync( NMT_ENGINE1_ID, - TestParallelCorpusConfig + TestMixedParallelCorpusConfig ); PretranslateCorpusConfig ptcc = new() { }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedParallelCorpus.Id }; @@ -1815,6 +1831,32 @@ public async Task StartBuildAsync_ParallelCorpus_PretranslateNoCorpusSpecified() }); } + [Test] + public async Task StartBuildAsync_ParallelCorpus_PretranslateFilterOnMultipleSources() + { + TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); + TranslationParallelCorpus addedParallelCorpus = await client.AddParallelCorpusAsync( + NMT_ENGINE1_ID, + TestParallelCorpusConfig + ); + PretranslateCorpusConfig ptcc = + new() + { + ParallelCorpusId = addedParallelCorpus.Id, + SourceFilters = + [ + new ParallelCorpusFilterConfig() { CorpusId = SOURCE_CORPUS_ID_1 }, + new ParallelCorpusFilterConfig() { CorpusId = SOURCE_CORPUS_ID_2 } + ] + }; + TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedParallelCorpus.Id }; + TranslationBuildConfig tbc = new TranslationBuildConfig { Pretranslate = [ptcc], TrainOn = [tcc] }; + Assert.ThrowsAsync(async () => + { + await client.StartBuildAsync(NMT_ENGINE1_ID, tbc); + }); + } + [Test] public async Task StartBuildAsync_ParallelCorpus_TrainOnNoCorpusSpecified() { From ea59cd7245dd6659558cbce458cab89782e35333 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 31 Oct 2024 16:22:32 -0400 Subject: [PATCH 3/3] remove FIXME's that are no longer needed. --- .../Services/PreprocessBuildJobTests.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index da976acf..d29f2213 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -76,7 +76,6 @@ public async Task RunAsync_PretranslateAll() await env.RunBuildJobAsync(corpus1); - // FIXME This should be 4, but the "don't pretranslate things trained on" logic is not implemented yet. Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); } @@ -190,7 +189,6 @@ public async Task RunAsync_MixedSource_Paratext() Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - // FIXME - this should be 56 (or double check) Assert.That( await env.GetPretranslateCountAsync(), Is.EqualTo(13), @@ -214,7 +212,6 @@ public async Task RunAsync_MixedSource_Text() Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - // FIXME this should be 9. Assert.That( await env.GetPretranslateCountAsync(), Is.EqualTo(2), @@ -482,7 +479,6 @@ await env.GetTargetExtractAsync(), }); JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.That(pretranslations, Is.Not.Null); - // FIXME this should be 37. Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); Assert.That( pretranslations[2]!["translation"]!.ToString(),