Skip to content

Commit

Permalink
Fix up USFM pretranslations for Parallel corpus (#529)
Browse files Browse the repository at this point in the history
* Fix up USFM pretranslations for Parallel corpus

* Make 'use first source' consistent across preprocessing and add a check

* Remove FIXMEs that are no longer needed.

---------

Co-authored-by: Enkidu93 <[email protected]>
  • Loading branch information
johnml1135 and Enkidu93 authored Oct 31, 2024
1 parent b8277f2 commit f872bfa
Show file tree
Hide file tree
Showing 7 changed files with 267 additions and 85 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,16 @@ row.Ref is not ScriptureRef sr
);
})
.ToArray();
ITextCorpus[] sourcePretranslateCorpora = sourceCorpora
ITextCorpus? sourcePretranslateCorpus = sourceCorpora
.Select(sc =>
{
ITextCorpus textCorpus = sc.TextCorpus;
if (sc.Corpus.PretranslateTextIds is not null)
textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds);
{
textCorpus = textCorpus.FilterTexts(
sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new())
);
}
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.PretranslateChapters is null
Expand All @@ -154,7 +158,8 @@ row.Ref is not ScriptureRef sr
)
);
})
.ToArray();
.ToArray()
.FirstOrDefault();

(MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus
.TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
Expand Down Expand Up @@ -254,11 +259,13 @@ void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList<object> refs,

ITextCorpus targetCorpus =
targetCorpora.Length > 0 ? targetCorpora[0].TextCorpus : new DictionaryTextCorpus();

foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpus))
if (sourcePretranslateCorpus != null)
{
if (row.SourceSegment.Length > 0)
WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment);
foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpus, targetCorpus))
{
if (row.SourceSegment.Length > 0 && (row.TargetSegment.Length == 0 || !targetCorpus.Any()))
WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment);
}
}
}

Expand Down Expand Up @@ -415,14 +422,18 @@ IReadOnlyList<ITextCorpus> trgCorpora
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
{
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
List<object> refs = [];
string textId = "";
foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true)))

srcCorpus = srcCorpus.Transform(CleanSegment);
trgCorpus = trgCorpus.Transform(CleanSegment);

foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
{
if (!row.IsTargetRangeStart && row.IsTargetInRange)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ public async Task RunAsync_PretranslateAll()

await env.RunBuildJobAsync(corpus1);

// FIXME This should be 4, but the "don't pretranslate things trained on" logic is not implemented yet.
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2));
}

Expand Down Expand Up @@ -190,8 +189,11 @@ public async Task RunAsync_MixedSource_Paratext()
Assert.That(trgCount, Is.EqualTo(1));
Assert.That(termCount, Is.EqualTo(0));
});
// FIXME - this should be 56 (or double check)
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(30));
Assert.That(
await env.GetPretranslateCountAsync(),
Is.EqualTo(13),
(await env.GetPretranslationsAsync())?.ToJsonString()
);
}

[Test]
Expand All @@ -210,8 +212,11 @@ public async Task RunAsync_MixedSource_Text()
Assert.That(trgCount, Is.EqualTo(1));
Assert.That(termCount, Is.EqualTo(0));
});
// FIXME this should be 9.
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(5));
Assert.That(
await env.GetPretranslateCountAsync(),
Is.EqualTo(2),
(await env.GetPretranslationsAsync())?.ToJsonString()
);
}

[Test]
Expand Down Expand Up @@ -474,8 +479,7 @@ await env.GetTargetExtractAsync(),
});
JsonArray? pretranslations = await env.GetPretranslationsAsync();
Assert.That(pretranslations, Is.Not.Null);
// FIXME this should be 37.
Assert.That(pretranslations!.Count, Is.EqualTo(24), pretranslations.ToJsonString());
Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString());
Assert.That(
pretranslations[2]!["translation"]!.ToString(),
Is.EqualTo("Source one, chapter twelve, verse one.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,24 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source)
$"The parallel corpus {pcc.ParallelCorpusId} is not valid: This parallel corpus does not exist for engine {engine.Id}."
);
}
if (
pcc.SourceFilters != null
&& pcc.SourceFilters.Count > 0
&& (
pcc.SourceFilters.Select(sf => sf.CorpusId).Distinct().Count() > 1
|| pcc.SourceFilters[0].CorpusId
!= engine
.ParallelCorpora.Where(pc => pc.Id == pcc.ParallelCorpusId)
.First()
.SourceCorpora[0]
.Id
)
)
{
throw new InvalidOperationException(
$"Only the first source corpus in a parallel corpus may be filtered for pretranslation."
);
}
pretranslateCorpora.Add(
new PretranslateCorpus
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,24 @@ public async Task<string> GetUsfmAsync(
{
Engine? engine = await _engines.GetAsync(engineId, cancellationToken);
Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId);
if (corpus is null)
throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'.");
ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId);

CorpusFile sourceFile = corpus.SourceFiles[0];
CorpusFile targetFile = corpus.TargetFiles[0];
CorpusFile sourceFile;
CorpusFile targetFile;
if (corpus is not null)
{
sourceFile = corpus.SourceFiles[0];
targetFile = corpus.TargetFiles[0];
}
else if (parallelCorpus is not null)
{
sourceFile = parallelCorpus.SourceCorpora[0].Files[0];
targetFile = parallelCorpus.TargetCorpora[0].Files[0];
}
else
{
throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'.");
}
if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext)
throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora.");

Expand Down
Loading

0 comments on commit f872bfa

Please sign in to comment.