From 5c6950af3e42cbacb34df50f32847c72cc3543fd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 11:19:09 -0500 Subject: [PATCH] Address reviewer comments --- .../ParallelCorpusPreprocessingService.cs | 88 ++++++++++--------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 934987c7..9f1439ff 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -37,60 +37,26 @@ public void Preprocess( (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) .ToArray(); + + if (sourceCorpora.Length == 0) + continue; + ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus.Transform(CleanSegment); - if (sc.Corpus.TrainOnTextIds is not null) - return textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.TrainOnChapters is null - || IsInChapters(sr, sc.Corpus.TrainOnChapters) - ); - }) + .Select(sc => FilterTrainingCorpora(sc.Corpus, sc.TextCorpus)) .ToArray(); + ITextCorpus[] sourcePretranslateCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus.Transform(CleanSegment); - if (sc.Corpus.PretranslateTextIds is not null) - { - return textCorpus.FilterTexts( - sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new()) - ); - } - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.PretranslateChapters is null - || ( - IsInChapters(sr, sc.Corpus.PretranslateChapters) - && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new()) - ) - ); - }) + .Select(sc => FilterPretranslateCorpora(sc.Corpus, sc.TextCorpus)) .ToArray(); (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) .ToArray(); + ITextCorpus[] targetTrainingCorpora = targetCorpora - .Select(tc => - { - ITextCorpus textCorpus = tc.TextCorpus.Transform(CleanSegment); - if (tc.Corpus.TrainOnTextIds is not null) - return textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || tc.Corpus.TrainOnChapters is null - || IsInChapters(sr, tc.Corpus.TrainOnChapters) - ); - }) + .Select(tc => FilterTrainingCorpora(tc.Corpus, tc.TextCorpus)) .ToArray(); - if (sourceCorpora.Length == 0) - continue; - ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); if (sourceTrainingCorpus.IsScripture()) { @@ -145,6 +111,42 @@ row.Ref is not ScriptureRef sr } } + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.PretranslateTextIds is not null) + { + return textCorpus.FilterTexts(corpus.PretranslateTextIds.Except(corpus.TrainOnTextIds ?? new())); + } + if (corpus.PretranslateChapters is not null) + { + return textCorpus + .FilterTexts(corpus.PretranslateChapters.Keys) + .Where(row => + row.Ref is not ScriptureRef sr + || ( + IsInChapters(sr, corpus.PretranslateChapters) + && !IsInChapters(sr, corpus.TrainOnChapters ?? new()) + ) + ); + } + return textCorpus; + } + + private static ITextCorpus FilterTrainingCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.TrainOnTextIds is not null) + return textCorpus.FilterTexts(corpus.TrainOnTextIds); + if (corpus.TrainOnChapters is not null) + { + return textCorpus + .FilterTexts(corpus.TrainOnChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.TrainOnChapters)); + } + return textCorpus; + } + private static IEnumerable CollapseRanges(ParallelTextRow[] rows) { StringBuilder srcSegBuffer = new();