Skip to content

Commit

Permalink
Serval-side mixed source support (#497)
Browse files Browse the repository at this point in the history
Mixed source support
  • Loading branch information
Enkidu93 authored Oct 9, 2024
1 parent 083d68e commit 93ca485
Show file tree
Hide file tree
Showing 64 changed files with 6,257 additions and 1,385 deletions.
21 changes: 11 additions & 10 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,22 @@ await client.BuildStartedAsync(
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
foreach (Corpus corpus in request.Corpora)
foreach (ParallelCorpus corpus in request.Corpora)
{
if (!corpus.PretranslateAll && corpus.PretranslateTextIds.Count == 0)
continue;

var sourceFiles = corpus
.SourceFiles.Where(f =>
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
.SourceCorpora.SelectMany(sc =>
sc.Files.Where(f =>
(sc.PretranslateTextIds is null || sc.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);
var targetFiles = corpus
.TargetFiles.Where(f =>
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
.TargetCorpora.SelectMany(tc =>
tc.Files.Where(f =>
(tc.PretranslateTextIds is null || tc.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);

Expand Down
14 changes: 0 additions & 14 deletions src/Machine/src/Serval.Machine.Shared/Models/Corpus.cs

This file was deleted.

12 changes: 12 additions & 0 deletions src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
namespace Serval.Machine.Shared.Models;

/// <summary>
/// A single-language corpus: a set of files in one language together with optional
/// selections of which texts/chapters are used for training and for pretranslation.
/// </summary>
public record MonolingualCorpus
{
/// <summary>Identifier of this corpus.</summary>
public required string Id { get; set; }
/// <summary>Language of this corpus (presumably a language tag/code; confirm against callers).</summary>
public required string Language { get; set; }
/// <summary>The files that make up this corpus.</summary>
public required IReadOnlyList<CorpusFile> Files { get; set; }
/// <summary>
/// Text ids to train on. Nullable; NOTE(review): consumers appear to treat <c>null</c> as
/// "no text-id restriction" — confirm against the build jobs that read this.
/// </summary>
public HashSet<string>? TrainOnTextIds { get; set; }
/// <summary>
/// Chapters to train on, as a map from a string key to a set of chapter numbers.
/// Nullable; NOTE(review): the key is presumably a Scripture book id and <c>null</c>
/// presumably means "no chapter restriction" — confirm against consumers.
/// </summary>
public Dictionary<string, HashSet<int>>? TrainOnChapters { get; set; }
/// <summary>
/// Text ids to pretranslate. Nullable; NOTE(review): consumers appear to treat <c>null</c> as
/// "no text-id restriction" — confirm against the build jobs that read this.
/// </summary>
public HashSet<string>? PretranslateTextIds { get; set; }
/// <summary>
/// Chapters to pretranslate, as a map from a string key to a set of chapter numbers.
/// Nullable; NOTE(review): same key/null semantics as <see cref="TrainOnChapters"/> are
/// presumed — confirm against consumers.
/// </summary>
public Dictionary<string, HashSet<int>>? PretranslateChapters { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace Serval.Machine.Shared.Models;

/// <summary>
/// A parallel corpus made up of any number of source-side and target-side
/// <see cref="MonolingualCorpus"/> instances.
/// </summary>
public record ParallelCorpus
{
/// <summary>Identifier of this parallel corpus.</summary>
public required string Id { get; set; }
/// <summary>Source-side monolingual corpora. Defaults to an empty list.</summary>
public IReadOnlyList<MonolingualCorpus> SourceCorpora { get; set; } = new List<MonolingualCorpus>();
/// <summary>Target-side monolingual corpora. Defaults to an empty list.</summary>
public IReadOnlyList<MonolingualCorpus> TargetCorpora { get; set; } = new List<MonolingualCorpus>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken = default
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public async Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken = default
)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public Job CreateJob(string engineId, string buildId, BuildStage stage, object?
return stage switch
{
BuildStage.Preprocess
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<Corpus>>(
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<ParallelCorpus>>(
engineId,
buildId,
"nmt",
Expand Down
168 changes: 97 additions & 71 deletions src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace Serval.Machine.Shared.Services;

public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<ParallelCorpus>>
{
private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true };

Expand Down Expand Up @@ -43,7 +43,7 @@ internal int Seed
protected override async Task DoWorkAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<Models.ParallelCorpus> data,
string? buildOptions,
CancellationToken cancellationToken
)
Expand Down Expand Up @@ -99,7 +99,7 @@ CancellationToken cancellationToken

private async Task<(int TrainCount, int PretranslateCount)> WriteDataFilesAsync(
string buildId,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
string? buildOptions,
CancellationToken cancellationToken
)
Expand All @@ -121,33 +121,94 @@ CancellationToken cancellationToken
int trainCount = 0;
int pretranslateCount = 0;
pretranslateWriter.WriteStartArray();
foreach (Corpus corpus in corpora)
foreach (ParallelCorpus corpus in corpora)
{
ITextCorpus[] sourceTextCorpora = _corpusService.CreateTextCorpora(corpus.SourceFiles).ToArray();
ITextCorpus targetTextCorpus =
_corpusService.CreateTextCorpora(corpus.TargetFiles).FirstOrDefault() ?? new DictionaryTextCorpus();
(MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus
.SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
.ToArray();
ITextCorpus[] sourceTrainingCorpora = sourceCorpora
.Select(sc =>
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.TrainOnTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.TrainOnChapters is null
|| IsInChapters(sr, sc.Corpus.TrainOnChapters)
);
})
.ToArray();
ITextCorpus[] sourcePretranslateCorpora = sourceCorpora
.Select(sc =>
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.PretranslateTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.PretranslateChapters is null
|| (
IsInChapters(sr, sc.Corpus.PretranslateChapters)
&& !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new())
)
);
})
.ToArray();

(MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus
.TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
.ToArray();
ITextCorpus[] targetTrainingCorpora = targetCorpora
.Select(tc =>
{
ITextCorpus textCorpus = tc.TextCorpus;
if (tc.Corpus.TrainOnTextIds is not null)
textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| tc.Corpus.TrainOnChapters is null
|| IsInChapters(sr, tc.Corpus.TrainOnChapters)
);
})
.ToArray();

if (sourceTextCorpora.Length == 0)
if (sourceCorpora.Length == 0)
continue;

int skipCount = 0;
foreach (Row?[] rows in AlignTrainCorpus(corpus, sourceTextCorpora, targetTextCorpus))
foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora))
{
if (skipCount > 0)
{
skipCount--;
continue;
}

Row[] trainRows = rows.Where(r => r is not null && IsInTrain(r, corpus)).Cast<Row>().ToArray();
Row[] trainRows = rows.Where(r => r is not null).Cast<Row>().ToArray();
if (trainRows.Length > 0)
{
Row row = trainRows[0];
if (rows.Length > 1)
{
Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray();
Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray();
if (targetNonEmptyRows.Length > 0)
nonEmptyRows = targetNonEmptyRows;
if (nonEmptyRows.Length > 0)
row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
{
nonEmptyRows = nonEmptyRows
.GroupBy(r => r.SourceSegment)
.Select(group => group.First())
.ToArray();
{
nonEmptyRows = nonEmptyRows
.GroupBy(r => r.SourceSegment)
.Select(group => group.First())
.ToArray();
row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
}
}
}

await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
Expand All @@ -160,8 +221,12 @@ CancellationToken cancellationToken

if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
{
ITextCorpus? sourceTermCorpus = _corpusService.CreateTermCorpora(corpus.SourceFiles).FirstOrDefault();
ITextCorpus? targetTermCorpus = _corpusService.CreateTermCorpora(corpus.TargetFiles).FirstOrDefault();
ITextCorpus? sourceTermCorpus = _corpusService
.CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList())
.FirstOrDefault();
ITextCorpus? targetTermCorpus = _corpusService
.CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList())
.FirstOrDefault();
if (sourceTermCorpus is not null && targetTermCorpus is not null)
{
IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus);
Expand All @@ -174,13 +239,9 @@ CancellationToken cancellationToken
}
}

foreach (Row row in AlignPretranslateCorpus(corpus, sourceTextCorpora[0], targetTextCorpus))
foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus))
{
if (
IsInPretranslate(row, corpus)
&& row.SourceSegment.Length > 0
&& (row.TargetSegment.Length == 0 || !IsInTrain(row, corpus))
)
if (row.SourceSegment.Length > 0)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
Expand All @@ -201,10 +262,17 @@ CancellationToken cancellationToken
return (trainCount, pretranslateCount);
}

/// <summary>
/// Determines whether a Scripture reference falls inside a book/chapter selection.
/// </summary>
/// <param name="sr">The reference to test; its <c>Book</c> and <c>ChapterNum</c> are read.</param>
/// <param name="selection">Map from book to the set of selected chapter numbers.</param>
/// <returns>
/// <c>true</c> when the reference's book has a non-null entry in <paramref name="selection"/>
/// and that entry is either empty (the whole book is selected) or contains the reference's
/// chapter number; otherwise <c>false</c>.
/// </returns>
private static bool IsInChapters(ScriptureRef sr, Dictionary<string, HashSet<int>> selection)
{
    // No entry for the book (or a null entry) means the reference is not selected.
    if (!selection.TryGetValue(sr.Book, out HashSet<int>? selectedChapters) || selectedChapters is null)
    {
        return false;
    }

    // An empty chapter set stands for "every chapter of this book".
    if (selectedChapters.Count == 0)
    {
        return true;
    }

    return selectedChapters.Contains(sr.ChapterNum);
}

protected override async Task CleanupAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<ParallelCorpus> data,
JobCompletionStatus completionStatus
)
{
Expand All @@ -221,70 +289,33 @@ JobCompletionStatus completionStatus
}
}

private static bool IsInTrain(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.TrainOnTextIds, corpus.TrainOnChapters);
}

private static bool IsInPretranslate(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.PretranslateTextIds, corpus.PretranslateChapters);
}

private static bool IsIncluded(
Row? row,
IReadOnlySet<string>? textIds,
IReadOnlyDictionary<string, HashSet<int>>? chapters
)
{
if (row is null)
return false;
if (chapters is not null)
return row.Refs.Any(r => IsInChapters(chapters, r));
if (textIds is not null)
return textIds.Contains(row.TextId);
return true;
}

private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookChapters, object rowRef)
{
if (rowRef is not ScriptureRef sr)
return false;
return bookChapters.TryGetValue(sr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(sr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<Row?[]> AlignTrainCorpus(
Corpus corpus,
IReadOnlyList<ITextCorpus> srcCorpora,
ITextCorpus trgCorpus
IReadOnlyList<ITextCorpus> trgCorpora
)
{
IEnumerable<string>? textIds = corpus.TrainOnChapters is not null
? corpus.TrainOnChapters.Keys
: corpus.TrainOnTextIds;
srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds).Transform(CleanSegment)).ToArray();
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);
srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray();
trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray();

if (trgCorpus.IsScripture())
if (trgCorpora.All(tc => tc.IsScripture()))
{
return srcCorpora
.Select(sc => AlignScripture(sc, trgCorpus))
.SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc)))
.ZipMany(rows => rows.ToArray())
// filter out every list that only contains completely empty rows
.Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0));
}

IEnumerable<Row[]> sourceOnlyRows = srcCorpora
.Select(sc => sc.AlignRows(trgCorpus, allSourceRows: true))
.SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true)))
.ZipMany(rows =>
rows.Where(r => r.TargetSegment.Count == 0)
.Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
.ToArray()
);

IEnumerable<Row[]> targetRows = srcCorpora
.Select(sc => sc.AlignRows(trgCorpus, allTargetRows: true))
.SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true)))
.ZipMany(rows =>
rows.Where(r => r.TargetSegment.Count > 0)
.Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
Expand Down Expand Up @@ -379,19 +410,14 @@ ITextCorpus trgCorpus
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(Corpus corpus, ITextCorpus srcCorpus, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus)
{
IEnumerable<string>? textIds = corpus.PretranslateChapters is not null
? corpus.PretranslateChapters.Keys
: corpus.PretranslateTextIds;
srcCorpus = srcCorpus.FilterTexts(textIds).Transform(CleanSegment);
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
List<object> refs = [];
string textId = "";
foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true)))
{
if (!row.IsTargetRangeStart && row.IsTargetInRange)
{
Expand Down
Loading

0 comments on commit 93ca485

Please sign in to comment.