Skip to content

Commit

Permalink
Add support for non-verse text segments in Scripture corpora (#179)
Browse files Browse the repository at this point in the history
- add new ScriptureRef corpus ref class
- update Scripture corpora classes to use ScriptureRef
- add ScriptureRefUsfmParserHandlerBase class to track ScriptureRef in USFM
- update UsfmTextUpdater and UsfmTextBase to use ScriptureRefUsfmParserHandlerBase
- add support for updating non-Scripture paragraphs and notes
- update NmtPreprocessBuildJob to support non-Scripture segments

Co-authored-by: John Lambert <[email protected]>
  • Loading branch information
ddaspit and johnml1135 authored Apr 11, 2024
1 parent f9baaa1 commit 95c7759
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public IEnumerable<ITextCorpus> CreateTextCorpora(IReadOnlyList<CorpusFile> file
break;

case FileFormat.Paratext:
corpora.Add(new ParatextBackupTextCorpus(file.Location));
corpora.Add(new ParatextBackupTextCorpus(file.Location, includeAllText: true));
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ CancellationToken cancellationToken
continue;

int skipCount = 0;
foreach (Row?[] rows in AlignCorpora(sourceTextCorpora, targetTextCorpus))
foreach (Row?[] rows in AlignTrainCorpus(sourceTextCorpora, targetTextCorpus))
{
if (skipCount > 0)
{
Expand All @@ -153,26 +153,6 @@ CancellationToken cancellationToken
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
}

Row? pretranslateRow = rows[0];
if (
pretranslateRow is not null
&& IsInPretranslate(pretranslateRow, corpus)
&& pretranslateRow.SourceSegment.Length > 0
&& pretranslateRow.TargetSegment.Length == 0
)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
pretranslateWriter.WriteString("textId", pretranslateRow.TextId);
pretranslateWriter.WriteStartArray("refs");
foreach (object rowRef in pretranslateRow.Refs)
pretranslateWriter.WriteStringValue(rowRef.ToString());
pretranslateWriter.WriteEndArray();
pretranslateWriter.WriteString("translation", pretranslateRow.SourceSegment);
pretranslateWriter.WriteEndObject();
pretranslateCount++;
}
}

if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
Expand All @@ -190,6 +170,23 @@ pretranslateRow is not null
}
}
}

foreach (Row row in AlignPretranslateCorpus(sourceTextCorpora[0], targetTextCorpus))
{
if (IsInPretranslate(row, corpus) && row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
pretranslateWriter.WriteString("textId", row.TextId);
pretranslateWriter.WriteStartArray("refs");
foreach (object rowRef in row.Refs)
pretranslateWriter.WriteStringValue(rowRef.ToString());
pretranslateWriter.WriteEndArray();
pretranslateWriter.WriteString("translation", row.SourceSegment);
pretranslateWriter.WriteEndObject();
pretranslateCount++;
}
}
}
pretranslateWriter.WriteEndArray();

Expand Down Expand Up @@ -244,13 +241,13 @@ private static bool IsIncluded(

/// <summary>
/// Determines whether a row reference falls inside the selected books and chapters.
/// </summary>
/// <param name="bookChapters">
/// Maps a book to the set of chapter numbers selected for it. An empty set selects every chapter of that book.
/// </param>
/// <param name="rowRef">
/// The row reference to test. A reference that is not of the Scripture reference type matched below is never
/// considered to be in the selection.
/// </param>
/// <returns>
/// <c>true</c> when the reference's book is a key of <paramref name="bookChapters"/> and either its chapter number
/// is in that book's set or the set is empty; otherwise <c>false</c>.
/// </returns>
private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookChapters, object rowRef)
{
// Anything that is not a Scripture reference cannot be matched against book/chapter selections.
if (rowRef is not VerseRef vr)
if (rowRef is not ScriptureRef sr)
return false;
// The book must be listed; an empty chapter set acts as a "whole book" wildcard.
return bookChapters.TryGetValue(vr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(vr.ChapterNum) || chapters.Count == 0);
return bookChapters.TryGetValue(sr.Book, out HashSet<int>? chapters)
&& (chapters.Contains(sr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<Row?[]> AlignCorpora(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
private static IEnumerable<Row?[]> AlignTrainCorpus(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
{
if (trgCorpus.IsScripture())
{
Expand Down Expand Up @@ -332,7 +329,7 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
{
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
vrefs.Order().Select(v => new ScriptureRef(v)).Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
Expand All @@ -355,7 +352,7 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
{
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
vrefs.Order().Select(v => new ScriptureRef(v)).Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
Expand All @@ -365,6 +362,50 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
}
}

/// <summary>
/// Aligns a single source corpus with the target corpus and yields one <see cref="Row"/> per pretranslation
/// segment. Consecutive aligned rows that continue a target range are merged into the segment that started it.
/// </summary>
/// <param name="srcCorpus">
/// The source corpus. It is aligned against <paramref name="trgCorpus"/> with <c>allSourceRows: true</c>.
/// </param>
/// <param name="trgCorpus">The target corpus to align against.</param>
/// <returns>
/// A lazily evaluated sequence of rows. Every row carries its own snapshot of the accumulated refs, so rows stay
/// valid when the caller buffers them (for example with <c>ToList()</c>) and enumeration continues. The last
/// constructor argument of every yielded row is always <c>1</c>, regardless of how many aligned rows were merged.
/// </returns>
private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
{
    int rowCount = 0;
    StringBuilder srcSegBuffer = new();
    StringBuilder trgSegBuffer = new();
    List<object> refs = [];
    string textId = "";
    foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
    {
        if (!row.IsTargetRangeStart && row.IsTargetInRange)
        {
            // Continuation of the current target range: fold this row into the pending segment.
            // Only the source text is accumulated here; the target buffer is filled when the segment starts.
            refs.AddRange(row.Refs);
            if (row.SourceText.Length > 0)
            {
                if (srcSegBuffer.Length > 0)
                    srcSegBuffer.Append(' ');
                srcSegBuffer.Append(row.SourceText);
            }
            rowCount++;
        }
        else
        {
            if (rowCount > 0)
            {
                // Hand out a copy of the refs: the working list is cleared and reused right below, which would
                // otherwise silently rewrite the Refs of rows the caller is still holding on to.
                yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1);
                srcSegBuffer.Clear();
                trgSegBuffer.Clear();
                refs.Clear();
                rowCount = 0;
            }

            // Start a new pending segment with this row.
            textId = row.TextId;
            refs.AddRange(row.Refs);
            srcSegBuffer.Append(row.SourceText);
            trgSegBuffer.Append(row.TargetText);
            rowCount++;
        }
    }

    // Flush the segment that was still pending when the aligned rows ran out.
    if (rowCount > 0)
        yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1);
}

private record Row(
string TextId,
IReadOnlyList<object> Refs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public async Task RunAsync_TrainOnAll()
public async Task RunAsync_TrainOnTextIds()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = new HashSet<string> { "textId1" } };
Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = ["textId1"] };

await env.RunBuildJobAsync(corpus1);

Expand Down Expand Up @@ -72,7 +72,7 @@ public async Task RunAsync_PretranslateAll()
public async Task RunAsync_PretranslateTextIds()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = new HashSet<string> { "textId1" } };
Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = ["textId1"] };

await env.RunBuildJobAsync(corpus1);

Expand Down Expand Up @@ -178,7 +178,7 @@ public async Task RunAsync_MixedSource_Paratext()
Assert.That(trgCount, Is.EqualTo(1));
Assert.That(termCount, Is.EqualTo(0));
});
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(8));
Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(12));
}

[Test]
Expand Down Expand Up @@ -247,8 +247,8 @@ public TestEnvironment()
TargetLanguage = "en",
PretranslateAll = false,
TrainOnAll = false,
PretranslateTextIds = new HashSet<string>(),
TrainOnTextIds = new HashSet<string>(),
PretranslateTextIds = [],
TrainOnTextIds = [],
SourceFiles = [TextFile("source1")],
TargetFiles = [TextFile("target1")]
};
Expand All @@ -260,8 +260,8 @@ public TestEnvironment()
TargetLanguage = "en",
PretranslateAll = false,
TrainOnAll = false,
PretranslateTextIds = new HashSet<string>(),
TrainOnTextIds = new HashSet<string>(),
PretranslateTextIds = [],
TrainOnTextIds = [],
SourceFiles = [TextFile("source1"), TextFile("source2")],
TargetFiles = [TextFile("target1")]
};
Expand All @@ -273,8 +273,8 @@ public TestEnvironment()
TargetLanguage = "en",
PretranslateAll = false,
TrainOnAll = false,
PretranslateTextIds = new HashSet<string>(),
TrainOnTextIds = new HashSet<string>(),
PretranslateTextIds = [],
TrainOnTextIds = [],
SourceFiles = [ParatextFile("pt-source1")],
TargetFiles = [ParatextFile("pt-target1")]
};
Expand All @@ -286,8 +286,8 @@ public TestEnvironment()
TargetLanguage = "en",
PretranslateAll = false,
TrainOnAll = false,
PretranslateTextIds = new HashSet<string>(),
TrainOnTextIds = new HashSet<string>(),
PretranslateTextIds = [],
TrainOnTextIds = [],
SourceFiles = [ParatextFile("pt-source1"), ParatextFile("pt-source2")],
TargetFiles = [ParatextFile("pt-target1")]
};
Expand Down

0 comments on commit 95c7759

Please sign in to comment.