Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Serval-side mixed source support #497

Merged
merged 45 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
b457393
Initial commit - basic corpora CRUD
Enkidu93 Sep 6, 2024
7c448c4
Integrated CRUD with translation engines
Enkidu93 Sep 11, 2024
bc055a1
Move corpora to data files; implement Serval-side mixing logic (draft)
Enkidu93 Sep 20, 2024
8b4df4a
Tested mixing logic
Enkidu93 Sep 23, 2024
9943811
Fix namespace typo; add corpus service tests
Enkidu93 Sep 23, 2024
a3e10ed
Fix test typo
Enkidu93 Sep 23, 2024
32e5632
Update machine to 3.2.8
johnml1135 Sep 24, 2024
8d9c678
Update QA to 1.6.3
johnml1135 Sep 24, 2024
7d250e3
Update prod to 1.6.3
johnml1135 Sep 25, 2024
52dc1ba
Upgrade to Ubuntu 24.04 (#500)
johnml1135 Sep 30, 2024
84d78ed
Add test/proof-of-concept
Enkidu93 Sep 30, 2024
f7e5925
Remove IsIncluded logic
Enkidu93 Sep 30, 2024
29cb846
Switch to using 'Where' extension
Enkidu93 Sep 30, 2024
7c13dc0
Working machine-side logic implementation; (one outstanding test that…
Enkidu93 Oct 3, 2024
2a6983f
Add unit smoke test for more complex parallel corpus logic
Enkidu93 Oct 3, 2024
9efdc5d
Review changes
Enkidu93 Oct 4, 2024
8d4a121
Undo accidental deletion
Enkidu93 Oct 4, 2024
083d68e
Call 'GetAsksById' once per DoWork (#504)
Enkidu93 Oct 8, 2024
2544cd2
More tests
Enkidu93 Oct 8, 2024
3357c69
Fix typo in test; add pretranslation by text id test
Enkidu93 Oct 8, 2024
d097cbd
Add GetCorpus consumer
Enkidu93 Oct 8, 2024
195b70b
Fix typo in endpoint
Enkidu93 Oct 8, 2024
7195fda
Fix copy-paste error
Enkidu93 Oct 8, 2024
437d5af
Refactor tests; add tests for missing coverage; fix bug discovered du…
Enkidu93 Oct 9, 2024
1d359f1
Initial commit - basic corpora CRUD
Enkidu93 Sep 6, 2024
8b929ba
Integrated CRUD with translation engines
Enkidu93 Sep 11, 2024
8ed2ecb
Move corpora to data files; implement Serval-side mixing logic (draft)
Enkidu93 Sep 20, 2024
5520e0e
Tested mixing logic
Enkidu93 Sep 23, 2024
548ea5b
Fix namespace typo; add corpus service tests
Enkidu93 Sep 23, 2024
aec0670
Fix test typo
Enkidu93 Sep 23, 2024
dd987c7
Add test/proof-of-concept
Enkidu93 Sep 30, 2024
14dfa64
Remove IsIncluded logic
Enkidu93 Sep 30, 2024
70b2a49
Switch to using 'Where' extension
Enkidu93 Sep 30, 2024
8dbaafa
Working machine-side logic implementation; (one outstanding test that…
Enkidu93 Oct 3, 2024
f9db0da
Add unit smoke test for more complex parallel corpus logic
Enkidu93 Oct 3, 2024
c738836
Review changes
Enkidu93 Oct 4, 2024
fb0b6df
Undo accidental deletion
Enkidu93 Oct 4, 2024
1dfc10e
More tests
Enkidu93 Oct 8, 2024
06e7862
Fix typo in test; add pretranslation by text id test
Enkidu93 Oct 8, 2024
cd1b693
Add GetCorpus consumer
Enkidu93 Oct 8, 2024
151abc8
Fix typo in endpoint
Enkidu93 Oct 8, 2024
5dfbe70
Fix copy-paste error
Enkidu93 Oct 8, 2024
bb33813
Refactor tests; add tests for missing coverage; fix bug discovered du…
Enkidu93 Oct 9, 2024
9bf5c26
Merge branch 'mixed_source_support' of https://github.com/sillsdev/se…
Enkidu93 Oct 9, 2024
7eef883
Fix weird merge error
Enkidu93 Oct 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,22 @@ await client.BuildStartedAsync(
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
foreach (Corpus corpus in request.Corpora)
foreach (ParallelCorpus corpus in request.Corpora)
{
if (!corpus.PretranslateAll && corpus.PretranslateTextIds.Count == 0)
continue;

var sourceFiles = corpus
.SourceFiles.Where(f =>
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
.SourceCorpora.SelectMany(sc =>
sc.Files.Where(f =>
(sc.PretranslateTextIds is null || sc.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);
var targetFiles = corpus
.TargetFiles.Where(f =>
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
.TargetCorpora.SelectMany(tc =>
tc.Files.Where(f =>
(tc.PretranslateTextIds is null || tc.PretranslateTextIds.Contains(f.TextId))
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);

Expand Down
14 changes: 0 additions & 14 deletions src/Machine/src/Serval.Machine.Shared/Models/Corpus.cs

This file was deleted.

12 changes: 12 additions & 0 deletions src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
namespace Serval.Machine.Shared.Models;

/// <summary>
/// A corpus in a single language: a set of files plus optional selections of which
/// texts and chapters are used for training and which are used for pretranslation.
/// </summary>
public record MonolingualCorpus
{
/// <summary>Identifier of this corpus.</summary>
public required string Id { get; set; }
/// <summary>Language of the corpus (presumably a language tag/code; confirm against callers).</summary>
public required string Language { get; set; }
/// <summary>Files that make up this corpus.</summary>
public required IReadOnlyList<CorpusFile> Files { get; set; }
/// <summary>
/// Ids of the texts to train on. When <c>null</c>, PreprocessBuildJob applies no text-id
/// filter to the training corpus.
/// </summary>
public HashSet<string>? TrainOnTextIds { get; set; }
/// <summary>
/// Chapters to train on, keyed by book. When <c>null</c>, PreprocessBuildJob does not
/// restrict training rows by chapter; an empty chapter set for a book matches every
/// chapter of that book.
/// </summary>
public Dictionary<string, HashSet<int>>? TrainOnChapters { get; set; }
/// <summary>
/// Ids of the texts to pretranslate. When <c>null</c>, no text-id filter is applied
/// when selecting pretranslation rows.
/// </summary>
public HashSet<string>? PretranslateTextIds { get; set; }
/// <summary>
/// Chapters to pretranslate, keyed by book. When <c>null</c>, PreprocessBuildJob does not
/// restrict pretranslation rows by chapter; when set, rows that also fall inside
/// <see cref="TrainOnChapters"/> are excluded from pretranslation.
/// </summary>
public Dictionary<string, HashSet<int>>? PretranslateChapters { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace Serval.Machine.Shared.Models;

/// <summary>
/// Groups any number of source-side and target-side <see cref="MonolingualCorpus"/>
/// instances under a single corpus id.
/// </summary>
public record ParallelCorpus
{
/// <summary>Identifier of this parallel corpus; written as "corpusId" in the pretranslation output.</summary>
public required string Id { get; set; }
/// <summary>Source-side corpora. Defaults to an empty list.</summary>
public IReadOnlyList<MonolingualCorpus> SourceCorpora { get; set; } = new List<MonolingualCorpus>();
/// <summary>Target-side corpora. Defaults to an empty list.</summary>
public IReadOnlyList<MonolingualCorpus> TargetCorpora { get; set; } = new List<MonolingualCorpus>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken = default
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public async Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken = default
)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public Job CreateJob(string engineId, string buildId, BuildStage stage, object?
return stage switch
{
BuildStage.Preprocess
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<Corpus>>(
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<ParallelCorpus>>(
engineId,
buildId,
"nmt",
Expand Down
160 changes: 90 additions & 70 deletions src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace Serval.Machine.Shared.Services;

public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<ParallelCorpus>>
{
private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true };

Expand Down Expand Up @@ -44,7 +44,7 @@ internal int Seed
protected override async Task DoWorkAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<Models.ParallelCorpus> data,
string? buildOptions,
IDistributedReaderWriterLock @lock,
CancellationToken cancellationToken
Expand Down Expand Up @@ -103,7 +103,7 @@ CancellationToken cancellationToken

private async Task<(int TrainCount, int PretranslateCount)> WriteDataFilesAsync(
string buildId,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<ParallelCorpus> corpora,
string? buildOptions,
CancellationToken cancellationToken
)
Expand All @@ -125,33 +125,88 @@ CancellationToken cancellationToken
int trainCount = 0;
int pretranslateCount = 0;
pretranslateWriter.WriteStartArray();
foreach (Corpus corpus in corpora)
foreach (ParallelCorpus corpus in corpora)
{
ITextCorpus[] sourceTextCorpora = _corpusService.CreateTextCorpora(corpus.SourceFiles).ToArray();
ITextCorpus targetTextCorpus =
_corpusService.CreateTextCorpora(corpus.TargetFiles).FirstOrDefault() ?? new DictionaryTextCorpus();
(MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus
.SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
.ToArray();
ITextCorpus[] sourceTrainingCorpora = sourceCorpora
.Select(sc =>
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.TrainOnTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.TrainOnChapters is null
|| IsInChapters(sr, sc.Corpus.TrainOnChapters)
);
})
.ToArray();
ITextCorpus[] sourcePretranslateCorpora = sourceCorpora
.Select(sc =>
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.PretranslateTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.PretranslateChapters is null
|| (
IsInChapters(sr, sc.Corpus.PretranslateChapters)
&& !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new())
)
);
})
.ToArray();

(MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus
.TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
.ToArray();
ITextCorpus[] targetTrainingCorpora = targetCorpora
.Select(tc =>
{
ITextCorpus textCorpus = tc.TextCorpus;
if (tc.Corpus.TrainOnTextIds is not null)
textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds);
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| tc.Corpus.TrainOnChapters is null
|| IsInChapters(sr, tc.Corpus.TrainOnChapters)
);
})
.ToArray();

if (sourceTextCorpora.Length == 0)
if (sourceCorpora.Length == 0)
continue;

int skipCount = 0;
foreach (Row?[] rows in AlignTrainCorpus(corpus, sourceTextCorpora, targetTextCorpus))
foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora))
{
if (skipCount > 0)
{
skipCount--;
continue;
}

Row[] trainRows = rows.Where(r => r is not null && IsInTrain(r, corpus)).Cast<Row>().ToArray();
Row[] trainRows = rows.Where(r => r is not null).Cast<Row>().ToArray();
if (trainRows.Length > 0)
{
Row row = trainRows[0];
if (rows.Length > 1)
{
Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray();
Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray();
if (targetNonEmptyRows.Length > 0)
nonEmptyRows = targetNonEmptyRows;
if (nonEmptyRows.Length > 0)
{
nonEmptyRows = nonEmptyRows
.GroupBy(r => r.SourceSegment)
.Select(group => group.First())
.ToArray();
row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
}
}

await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
Expand All @@ -164,8 +219,12 @@ CancellationToken cancellationToken

if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
{
ITextCorpus? sourceTermCorpus = _corpusService.CreateTermCorpora(corpus.SourceFiles).FirstOrDefault();
ITextCorpus? targetTermCorpus = _corpusService.CreateTermCorpora(corpus.TargetFiles).FirstOrDefault();
ITextCorpus? sourceTermCorpus = _corpusService
.CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList())
.FirstOrDefault();
ITextCorpus? targetTermCorpus = _corpusService
.CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList())
.FirstOrDefault();
if (sourceTermCorpus is not null && targetTermCorpus is not null)
{
IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus);
Expand All @@ -178,13 +237,9 @@ CancellationToken cancellationToken
}
}

foreach (Row row in AlignPretranslateCorpus(corpus, sourceTextCorpora[0], targetTextCorpus))
foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus))
{
if (
IsInPretranslate(row, corpus)
&& row.SourceSegment.Length > 0
&& (row.TargetSegment.Length == 0 || !IsInTrain(row, corpus))
)
if (row.SourceSegment.Length > 0)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
Expand All @@ -205,10 +260,17 @@ CancellationToken cancellationToken
return (trainCount, pretranslateCount);
}

/// <summary>
/// Tells whether a scripture reference falls inside a book/chapter selection.
/// </summary>
/// <param name="sr">The reference whose book and chapter number are tested.</param>
/// <param name="selection">Selected chapters keyed by book.</param>
/// <returns>
/// <c>true</c> when the reference's book has a non-null entry in <paramref name="selection"/>
/// and that entry is either empty or contains the reference's chapter number.
/// </returns>
private static bool IsInChapters(ScriptureRef sr, Dictionary<string, HashSet<int>> selection)
{
    // A missing book, or a book mapped to a null set, is never selected.
    if (!selection.TryGetValue(sr.Book, out HashSet<int>? chapterSet) || chapterSet is null)
        return false;

    // An empty chapter set selects every chapter of the book.
    if (chapterSet.Count == 0)
        return true;

    return chapterSet.Contains(sr.ChapterNum);
}

protected override async Task CleanupAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<Models.ParallelCorpus> data,
IDistributedReaderWriterLock @lock,
JobCompletionStatus completionStatus
)
Expand All @@ -226,70 +288,33 @@ JobCompletionStatus completionStatus
}
}

private static bool IsInTrain(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.TrainOnTextIds, corpus.TrainOnChapters);
}

private static bool IsInPretranslate(Row row, Corpus corpus)
{
return IsIncluded(row, corpus.PretranslateTextIds, corpus.PretranslateChapters);
}

private static bool IsIncluded(
Row? row,
IReadOnlySet<string>? textIds,
IReadOnlyDictionary<string, HashSet<int>>? chapters
)
{
if (row is null)
return false;
if (chapters is not null)
return row.Refs.Any(r => IsInChapters(chapters, r));
if (textIds is not null)
return textIds.Contains(row.TextId);
return true;
}

private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookChapters, object rowRef)
{
if (rowRef is not ScriptureRef sr)
return false;
return bookChapters.TryGetValue(sr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(sr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<Row?[]> AlignTrainCorpus(
Corpus corpus,
IReadOnlyList<ITextCorpus> srcCorpora,
ITextCorpus trgCorpus
IReadOnlyList<ITextCorpus> trgCorpora
)
{
IEnumerable<string>? textIds = corpus.TrainOnChapters is not null
? corpus.TrainOnChapters.Keys
: corpus.TrainOnTextIds;
srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds).Transform(CleanSegment)).ToArray();
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);
srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray();
trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray();

if (trgCorpus.IsScripture())
if (trgCorpora.All(tc => tc.IsScripture()))
{
return srcCorpora
.Select(sc => AlignScripture(sc, trgCorpus))
.SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc)))
.ZipMany(rows => rows.ToArray())
// filter out every list that only contains completely empty rows
.Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0));
}

IEnumerable<Row[]> sourceOnlyRows = srcCorpora
.Select(sc => sc.AlignRows(trgCorpus, allSourceRows: true))
.SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true)))
.ZipMany(rows =>
rows.Where(r => r.TargetSegment.Count == 0)
.Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
.ToArray()
);

IEnumerable<Row[]> targetRows = srcCorpora
.Select(sc => sc.AlignRows(trgCorpus, allTargetRows: true))
.SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true)))
.ZipMany(rows =>
rows.Where(r => r.TargetSegment.Count > 0)
.Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
Expand Down Expand Up @@ -384,19 +409,14 @@ ITextCorpus trgCorpus
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(Corpus corpus, ITextCorpus srcCorpus, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus)
{
IEnumerable<string>? textIds = corpus.PretranslateChapters is not null
? corpus.PretranslateChapters.Keys
: corpus.PretranslateTextIds;
srcCorpus = srcCorpus.FilterTexts(textIds).Transform(CleanSegment);
trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment);
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
List<object> refs = [];
string textId = "";
foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true)))
{
if (!row.IsTargetRangeStart && row.IsTargetInRange)
{
Expand Down
Loading
Loading