Skip to content

Commit

Permalink
A better fix for #516. (#521)
Browse files Browse the repository at this point in the history
* A better fix for #516.

* Update documentation

* reviewer comment

* Documentation clarification

* Updated parameter names
  • Loading branch information
johnml1135 authored Oct 29, 2024
1 parent ac1193f commit f7060c7
Show file tree
Hide file tree
Showing 5 changed files with 404 additions and 49 deletions.
38 changes: 30 additions & 8 deletions src/Serval/src/Serval.Client/Client.g.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4437,10 +4437,21 @@ public partial interface ITranslationEnginesClient
/// Starts a build job for a translation engine.
/// </summary>
/// <remarks>
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used.
/// <br/>Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// <br/>Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range)
/// <br/>All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora).
/// <br/>Specifying a corpus:
/// <br/>* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId.
/// <br/>* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters.
/// <br/>
/// <br/>Filtering by textId or chapter:
/// <br/>* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// <br/>* Filters can also be supplied via the scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range).
/// <br/>* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// <br/>
/// <br/>Filter - train on all or none:
/// <br/>* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation, respectively.
/// <br/>* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used.
/// <br/>* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used.
/// <br/>* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively.
/// <br/>
/// <br/>Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation,
/// <br/>the following text will be pretranslated:
Expand Down Expand Up @@ -7217,10 +7228,21 @@ public string BaseUrl
/// Starts a build job for a translation engine.
/// </summary>
/// <remarks>
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used.
/// <br/>Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// <br/>Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range)
/// <br/>All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora).
/// <br/>Specifying a corpus:
/// <br/>* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId.
/// <br/>* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters.
/// <br/>
/// <br/>Filtering by textId or chapter:
/// <br/>* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// <br/>* Filters can also be supplied via the scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range).
/// <br/>* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// <br/>
/// <br/>Filter - train on all or none:
/// <br/>* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation, respectively.
/// <br/>* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used.
/// <br/>* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used.
/// <br/>* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively.
/// <br/>
/// <br/>Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation,
/// <br/>the following text will be pretranslated:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -990,10 +990,21 @@ CancellationToken cancellationToken
/// Starts a build job for a translation engine.
/// </summary>
/// <remarks>
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used.
/// Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range)
/// All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
/// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora).
/// Specifying a corpus:
/// * A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId.
/// * A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters.
///
/// Filtering by textId or chapter:
/// * Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training.
/// * Filters can also be supplied via the scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range).
/// * All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information.
///
/// Filter - train on all or none:
/// * If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation, respectively.
/// * If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used.
/// * If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used.
/// * If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively.
///
/// Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation,
/// the following text will be pretranslated:
Expand Down
103 changes: 76 additions & 27 deletions src/Serval/src/Serval.Translation/Services/EngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,13 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok
Corpora =
{
parallelCorpora.Select(c =>
Map(c, trainOn?.GetValueOrDefault(c.Id), pretranslate?.GetValueOrDefault(c.Id))
Map(
c,
trainOn?.GetValueOrDefault(c.Id),
pretranslate?.GetValueOrDefault(c.Id),
trainOn is null,
pretranslate is null
)
)
}
};
Expand Down Expand Up @@ -276,7 +282,13 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok
Corpora =
{
corpora.Select(c =>
Map(c, trainOn?.GetValueOrDefault(c.Id), pretranslate?.GetValueOrDefault(c.Id))
Map(
c,
trainOn?.GetValueOrDefault(c.Id),
pretranslate?.GetValueOrDefault(c.Id),
trainOn is null,
pretranslate is null
)
)
}
};
Expand Down Expand Up @@ -613,7 +625,13 @@ private Models.WordGraphArc Map(V1.WordGraphArc source)
};
}

private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, PretranslateCorpus? pretranslateCorpus)
private V1.ParallelCorpus Map(
Corpus source,
TrainingCorpus? trainingCorpus,
PretranslateCorpus? pretranslateCorpus,
bool trainOnAllCorpora,
bool pretranslateOnAllCorpora
)
{
IEnumerable<V1.CorpusFile> sourceFiles = source.SourceFiles.Select(Map);
IEnumerable<V1.CorpusFile> targetFiles = source.TargetFiles.Select(Map);
Expand All @@ -622,12 +640,15 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre
V1.MonolingualCorpus targetCorpus =
new() { Language = source.TargetLanguage, Files = { source.TargetFiles.Select(Map) } };

if (trainingCorpus is null || (trainingCorpus.TextIds is null && trainingCorpus.ScriptureRange is null))
if (
trainOnAllCorpora
|| (trainingCorpus is not null && trainingCorpus.TextIds is null && trainingCorpus.ScriptureRange is null)
)
{
sourceCorpus.TrainOnAll = true;
targetCorpus.TrainOnAll = true;
}
else
else if (trainingCorpus is not null)
{
if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null)
{
Expand Down Expand Up @@ -663,14 +684,18 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre
}
}
if (
pretranslateCorpus is null
|| (pretranslateCorpus.TextIds is null && pretranslateCorpus.ScriptureRange is null)
pretranslateOnAllCorpora
|| (
pretranslateCorpus is not null
&& pretranslateCorpus.TextIds is null
&& pretranslateCorpus.ScriptureRange is null
)
)
{
sourceCorpus.PretranslateAll = true;
targetCorpus.PretranslateAll = true;
}
else
else if (pretranslateCorpus is not null)
{
if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null)
{
Expand Down Expand Up @@ -713,14 +738,25 @@ pretranslateCorpus is null
private V1.ParallelCorpus Map(
Models.ParallelCorpus source,
TrainingCorpus? trainingCorpus,
PretranslateCorpus? pretranslateCorpus
PretranslateCorpus? pretranslateCorpus,
bool trainOnAllCorpora,
bool pretranslateOnAllCorpora
)
{
string? referenceFileLocation =
source.TargetCorpora.Count > 0 && source.TargetCorpora[0].Files.Count > 0
? Map(source.TargetCorpora[0].Files[0]).Location
: null;

bool trainOnAllSources =
trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.SourceFilters is null);
bool pretranslateAllSources =
pretranslateOnAllCorpora || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null);

bool trainOnAllTargets =
trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.TargetFilters is null);
bool pretranslateAllTargets = pretranslateOnAllCorpora || pretranslateCorpus is not null; // there is no pretranslate Target filter.

return new V1.ParallelCorpus
{
Id = source.Id,
Expand All @@ -731,7 +767,9 @@ private V1.ParallelCorpus Map(
sc,
trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(),
pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(),
referenceFileLocation
referenceFileLocation,
trainOnAllSources,
pretranslateAllSources
)
)
},
Expand All @@ -742,18 +780,22 @@ private V1.ParallelCorpus Map(
tc,
trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(),
null,
referenceFileLocation
referenceFileLocation,
trainOnAllTargets,
pretranslateAllTargets
)
)
}
};
}

private V1.MonolingualCorpus Map(
Models.MonolingualCorpus source,
Models.MonolingualCorpus inputCorpus,
ParallelCorpusFilter? trainingFilter,
ParallelCorpusFilter? pretranslateFilter,
string? referenceFileLocation
string? referenceFileLocation,
bool trainOnAll,
bool pretranslateOnAll
)
{
Dictionary<string, ScriptureChapters>? trainOnChapters = null;
Expand Down Expand Up @@ -794,41 +836,48 @@ pretranslateFilter is not null
.ToDictionary();
}

var corpus = new V1.MonolingualCorpus
var returnCorpus = new V1.MonolingualCorpus
{
Id = source.Id,
Language = source.Language,
Files = { source.Files.Select(Map) }
Id = inputCorpus.Id,
Language = inputCorpus.Language,
Files = { inputCorpus.Files.Select(Map) }
};

if (trainingFilter is null || (trainingFilter.TextIds is null && trainingFilter.ScriptureRange is null))
if (
trainOnAll
|| (trainingFilter is not null && trainingFilter.TextIds is null && trainingFilter.ScriptureRange is null)
)
{
corpus.TrainOnAll = true;
returnCorpus.TrainOnAll = true;
}
else
{
if (trainOnChapters is not null)
corpus.TrainOnChapters.Add(trainOnChapters);
returnCorpus.TrainOnChapters.Add(trainOnChapters);
if (trainingFilter?.TextIds is not null)
corpus.TrainOnTextIds.Add(trainingFilter.TextIds);
returnCorpus.TrainOnTextIds.Add(trainingFilter.TextIds);
}

if (
pretranslateFilter is null
|| (pretranslateFilter.TextIds is null && pretranslateFilter.ScriptureRange is null)
pretranslateOnAll
|| (
pretranslateFilter is not null
&& pretranslateFilter.TextIds is null
&& pretranslateFilter.ScriptureRange is null
)
)
{
corpus.PretranslateAll = true;
returnCorpus.PretranslateAll = true;
}
else
{
if (pretranslateChapters is not null)
corpus.PretranslateChapters.Add(pretranslateChapters);
returnCorpus.PretranslateChapters.Add(pretranslateChapters);
if (pretranslateFilter?.TextIds is not null)
corpus.PretranslateTextIds.Add(pretranslateFilter.TextIds);
returnCorpus.PretranslateTextIds.Add(pretranslateFilter.TextIds);
}

return corpus;
return returnCorpus;
}

private V1.CorpusFile Map(Models.CorpusFile source)
Expand Down
9 changes: 7 additions & 2 deletions src/Serval/test/Serval.E2ETests/ServalApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,16 @@ public async Task NmtBatch()
_helperClient.TranslationBuildConfig.Pretranslate = [new() { CorpusId = cId2, TextIds = ["2JN.txt"] }];
await _helperClient.BuildEngineAsync(engineId);
await Task.Delay(1000);
IList<Pretranslation> lTrans = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync(
IList<Pretranslation> lTrans1 = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync(
engineId,
cId1
);
Assert.That(lTrans1, Has.Count.EqualTo(0)); // should be nothing
IList<Pretranslation> lTrans2 = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync(
engineId,
cId2
);
Assert.That(lTrans, Has.Count.EqualTo(13)); // just 2 John
Assert.That(lTrans2, Has.Count.EqualTo(13)); // just 2 John
}

[Test]
Expand Down
Loading

0 comments on commit f7060c7

Please sign in to comment.