diff --git a/.github/workflows/ci-e2e.yml b/.github/workflows/ci-e2e.yml index 472e33d0..fc2a72df 100644 --- a/.github/workflows/ci-e2e.yml +++ b/.github/workflows/ci-e2e.yml @@ -10,7 +10,7 @@ jobs: build: name: Build runs-on: ubuntu-latest - timeout-minutes: 45 + timeout-minutes: 60 env: SERVAL_CLIENT_ID: ${{ secrets.SERVAL_CLIENT_ID }} diff --git a/src/Echo/src/EchoTranslationEngine/Program.cs b/src/Echo/src/EchoTranslationEngine/Program.cs index 6c6f3768..a679dfb5 100644 --- a/src/Echo/src/EchoTranslationEngine/Program.cs +++ b/src/Echo/src/EchoTranslationEngine/Program.cs @@ -17,9 +17,6 @@ WebApplication app = builder.Build(); -// Configure the HTTP request pipeline. -app.UseHttpsRedirection(); - app.MapGrpcService(); app.MapGrpcService(); diff --git a/src/Machine/src/Serval.Machine.EngineServer/Program.cs b/src/Machine/src/Serval.Machine.EngineServer/Program.cs index e36db6c2..b03f6575 100644 --- a/src/Machine/src/Serval.Machine.EngineServer/Program.cs +++ b/src/Machine/src/Serval.Machine.EngineServer/Program.cs @@ -35,8 +35,6 @@ var app = builder.Build(); -app.UseHttpsRedirection(); - app.MapServalTranslationEngineService(); app.MapHangfireDashboard(); diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index d9e433ce..13547791 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -238,21 +238,33 @@ row.Ref is not ScriptureRef sr } } } - - foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus)) + void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList refs, string translation) + { + writer.WriteStartObject(); + writer.WriteString("corpusId", corpus.Id); + writer.WriteString("textId", textId); + writer.WriteStartArray("refs"); + foreach (object rowRef in refs) + writer.WriteStringValue(rowRef.ToString()); + writer.WriteEndArray(); + writer.WriteString("translation", translation); + writer.WriteEndObject(); + pretranslateCount++; + } + if (targetCorpora.Length == 0) + { + foreach (Row row in GetPretranslateCorpusNoTarget(sourcePretranslateCorpora)) + { + if (row.SourceSegment.Length > 0) + WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); + } + } + else { - if (row.SourceSegment.Length > 0) + foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus)) { - pretranslateWriter.WriteStartObject(); - pretranslateWriter.WriteString("corpusId", corpus.Id); - pretranslateWriter.WriteString("textId", row.TextId); - pretranslateWriter.WriteStartArray("refs"); - foreach (object rowRef in row.Refs) - pretranslateWriter.WriteStringValue(rowRef.ToString()); - pretranslateWriter.WriteEndArray(); - pretranslateWriter.WriteString("translation", row.SourceSegment); - pretranslateWriter.WriteEndObject(); - pretranslateCount++; + if (row.SourceSegment.Length > 0) + WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); } } } @@ -454,6 +466,47 @@ private static IEnumerable AlignPretranslateCorpus(ITextCorpus[] srcCorpora yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); } + private static IEnumerable GetPretranslateCorpusNoTarget(ITextCorpus[] srcCorpora) + { + int rowCount = 0; + StringBuilder srcSegBuffer = new(); + List refs = []; + string textId = ""; + foreach (TextRow row in srcCorpora.SelectMany(sc => sc)) + { + if (!row.IsRangeStart && row.IsInRange) + { + refs.Add(row.Ref); + if (row.Segment.Count > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(string.Join(" ", row.Segment)); + } + rowCount++; + } + else + { + if (rowCount > 0) + { + yield return new(textId, refs, srcSegBuffer.ToString(), "", 1); + textId = ""; + srcSegBuffer.Clear(); + refs.Clear(); + rowCount = 0; + } + + textId = row.TextId; + refs.Add(row.Ref); + srcSegBuffer.Append(string.Join(" ", row.Segment)); + rowCount++; + } + } + + if (rowCount > 0) + yield return new(textId, refs, srcSegBuffer.ToString(), "", 1); + } + private record Row( string TextId, IReadOnlyList Refs, diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs index bced613b..83369376 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs @@ -295,7 +295,7 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou var pretranslateTextIds = source.PretranslateTextIds.ToHashSet(); FilterChoice pretranslateFilter = GetFilterChoice(pretranslateChapters, pretranslateTextIds); - return new Models.MonolingualCorpus + var corpus = new Models.MonolingualCorpus { Id = source.Id, Language = source.Language, @@ -305,6 +305,17 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null }; + if (source.PretranslateAll) + { + corpus.PretranslateChapters = null; + corpus.PretranslateTextIds = null; + } + if (source.TrainOnAll) + { + corpus.TrainOnChapters = null; + corpus.TrainOnTextIds = null; + } + return corpus; } private static Models.CorpusFile Map(Translation.V1.CorpusFile source) diff --git a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto index 98918f0c..609a3fc0 100644 --- a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto +++ b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto @@ -161,6 +161,8 @@ message ParallelCorpus { message MonolingualCorpus { string id = 1; string language = 2; + bool train_on_all = 3; + bool pretranslate_all = 4; map train_on_chapters = 5; map pretranslate_chapters = 6; repeated string train_on_text_ids = 7; diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 47c4ab9b..6c8c3059 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -601,7 +601,12 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre V1.MonolingualCorpus targetCorpus = new() { Language = source.TargetLanguage, Files = { source.TargetFiles.Select(Map) } }; - if (trainingCorpus != null) + if (trainingCorpus == null) + { + sourceCorpus.TrainOnAll = true; + targetCorpus.TrainOnAll = true; + } + else { if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) { @@ -636,7 +641,12 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre targetCorpus.TrainOnChapters.Add(chapters); } } - if (pretranslateCorpus != null) + if (pretranslateCorpus == null) + { + sourceCorpus.PretranslateAll = true; + targetCorpus.PretranslateAll = true; + } + else { if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) { diff --git a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs index ae70f6ce..31c971e9 100644 --- a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs +++ b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs @@ -3,6 +3,7 @@ namespace Serval.E2ETests; public class ServalClientHelper : IAsyncDisposable { public DataFilesClient DataFilesClient { get; } + public CorporaClient CorporaClient { get; } public TranslationEnginesClient TranslationEnginesClient { get; } public TranslationEngineTypesClient TranslationEngineTypesClient { get; } @@ -32,6 +33,7 @@ public ServalClientHelper(string audience, string prefix = "SCE_", bool ignoreSS _httpClient.BaseAddress = new Uri(hostUrl); _httpClient.Timeout = TimeSpan.FromSeconds(60); DataFilesClient = new DataFilesClient(_httpClient); + CorporaClient = new CorporaClient(_httpClient); TranslationEnginesClient = new TranslationEnginesClient(_httpClient); TranslationEngineTypesClient = new TranslationEngineTypesClient(_httpClient); _prefix = prefix; @@ -229,6 +231,77 @@ bool pretranslate return response.Id; } + public async Task AddParallelTextCorpusToEngineAsync( + string engineId, + string[] filesToAdd, + string sourceLanguage, + string targetLanguage, + bool pretranslate + ) + { + List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + + var targetFileConfig = new List(); + if (!pretranslate) + { + List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + foreach (var item in targetFiles.Select((file, i) => new { i, file })) + { + targetFileConfig.Add(new CorpusFileConfig { FileId = item.file.Id, TextId = filesToAdd[item.i] }); + } + } + + CorpusConfig targetCorpusConfig = + new() + { + Name = "None", + Language = targetLanguage, + Files = targetFileConfig + }; + + var targetCorpus = await CorporaClient.CreateAsync(targetCorpusConfig); + + var sourceFileConfig = new List(); + + if (sourceLanguage == targetLanguage && !pretranslate) + { + // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) + // if pretranslating, we need to upload the source separately + // if different languages, we are not echoing. + } + else + { + for (int i = 0; i < sourceFiles.Count; i++) + { + sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); + } + } + + CorpusConfig sourceCorpusConfig = + new() + { + Name = "None", + Language = sourceLanguage, + Files = sourceFileConfig + }; + + var sourceCorpus = await CorporaClient.CreateAsync(sourceCorpusConfig); + + TranslationParallelCorpusConfig parallelCorpusConfig = + new() { SourceCorpusIds = { sourceCorpus.Id }, TargetCorpusIds = { targetCorpus.Id } }; + + var parallelCorpus = await TranslationEnginesClient.AddParallelCorpusAsync(engineId, parallelCorpusConfig); + + if (pretranslate) + { + TranslationBuildConfig.Pretranslate!.Add( + new PretranslateCorpusConfig { CorpusId = parallelCorpus.Id, TextIds = filesToAdd.ToList() } + ); + } + + return parallelCorpus.Id; + } + public async Task> UploadFilesAsync( IEnumerable filesToAdd, FileFormat fileFormat,