Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix up USFM pretranslations for Parallel corpus #529

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,24 @@ public async Task<string> GetUsfmAsync(
{
Engine? engine = await _engines.GetAsync(engineId, cancellationToken);
Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId);
if (corpus is null)
throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'.");
ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId);

CorpusFile sourceFile = corpus.SourceFiles[0];
CorpusFile targetFile = corpus.TargetFiles[0];
CorpusFile sourceFile;
CorpusFile targetFile;
if (corpus is not null)
{
sourceFile = corpus.SourceFiles[0];
targetFile = corpus.TargetFiles[0];
}
else if (parallelCorpus is not null)
{
sourceFile = parallelCorpus.SourceCorpora[0].Files[0];
targetFile = parallelCorpus.TargetCorpora[0].Files[0];
}
else
{
throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'.");
}
if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext)
throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora.");

Expand Down
8 changes: 7 additions & 1 deletion src/Serval/test/Serval.E2ETests/ServalApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ public async Task NmtLargeBatchAndDownload()
engineId,
cId
);
TestContext.WriteLine(lTrans[0].Translation);
Assert.That(lTrans, Has.Count.EqualTo(14));
// Download the model from the s3 bucket
ModelDownloadUrl url = await _helperClient.TranslationEnginesClient.GetModelDownloadUrlAsync(engineId);
using Task<Stream> s = new HttpClient().GetStreamAsync(url.Url);
Expand Down Expand Up @@ -436,6 +436,12 @@ public async Task ParatextProjectNmtJobAsync()
corpus.Id
);
Assert.That(lTrans, Is.Not.Empty);
string usfm = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync(
engineId,
corpus.Id,
"JHN"
);
Assert.That(usfm, Does.Contain("\\v 1"));
}

[TearDown]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public class PretranslationServiceTests
[Test]
public async Task GetUsfmAsync_Source_PreferExisting()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.PreferExisting,
Expand All @@ -46,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferExisting()
[Test]
public async Task GetUsfmAsync_Source_PreferPretranslated()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.PreferPretranslated,
Expand All @@ -70,7 +70,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated()
[Test]
public async Task GetUsfmAsync_Source_OnlyExisting()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.OnlyExisting,
Expand All @@ -94,7 +94,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting()
[Test]
public async Task GetUsfmAsync_Source_OnlyPretranslated()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.OnlyPretranslated,
Expand All @@ -118,7 +118,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated()
[Test]
public async Task GetUsfmAsync_Target_PreferExisting()
{
TestEnvironment env = new();
using TestEnvironment env = new();
env.AddMatthewToTarget();

string usfm = await env.GetUsfmAsync(
Expand All @@ -143,7 +143,7 @@ public async Task GetUsfmAsync_Target_PreferExisting()
[Test]
public async Task GetUsfmAsync_Target_PreferPretranslated()
{
TestEnvironment env = new();
using TestEnvironment env = new();
env.AddMatthewToTarget();

string usfm = await env.GetUsfmAsync(
Expand All @@ -168,7 +168,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated()
[Test]
public async Task GetUsfmAsync_Target_TargetBookDoesNotExist()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.PreferPretranslated,
Expand All @@ -181,7 +181,7 @@ public async Task GetUsfmAsync_Target_TargetBookDoesNotExist()
[Test]
public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist()
{
TestEnvironment env = new();
using TestEnvironment env = new();

string usfm = await env.GetUsfmAsync(
PretranslationUsfmTextOrigin.PreferPretranslated,
Expand All @@ -205,7 +205,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist()
[Test]
public async Task GetUsfmAsync_Auto_TargetBookExists()
{
TestEnvironment env = new();
using TestEnvironment env = new();
env.AddMatthewToTarget();

string usfm = await env.GetUsfmAsync(
Expand All @@ -230,7 +230,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists()
[Test]
public async Task GetUsfmAsync_Target_OnlyExisting()
{
TestEnvironment env = new();
using TestEnvironment env = new();
env.AddMatthewToTarget();

string usfm = await env.GetUsfmAsync(
Expand All @@ -244,7 +244,7 @@ public async Task GetUsfmAsync_Target_OnlyExisting()
[Test]
public async Task GetUsfmAsync_Target_OnlyPretranslated()
{
TestEnvironment env = new();
using TestEnvironment env = new();
env.AddMatthewToTarget();

string usfm = await env.GetUsfmAsync(
Expand All @@ -266,10 +266,26 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated()
);
}

private class TestEnvironment
private class TestEnvironment : IDisposable
{
public TestEnvironment()
{
CorpusFile file1 =
new()
{
Id = "file1",
Filename = "file1.zip",
Format = Shared.Contracts.FileFormat.Paratext,
TextId = "project1"
};
CorpusFile file2 =
new()
{
Id = "file2",
Filename = "file2.zip",
Format = Shared.Contracts.FileFormat.Paratext,
TextId = "project1"
};
Engines = new MemoryRepository<Engine>(
[
new()
Expand All @@ -287,29 +303,45 @@ public TestEnvironment()
Id = "corpus1",
SourceLanguage = "en",
TargetLanguage = "en",
SourceFiles =
[
SourceFiles = [file1],
TargetFiles = [file2],
}
]
},
new()
{
Id = "parallel_engine1",
Owner = "owner1",
SourceLanguage = "en",
TargetLanguage = "en",
Type = "nmt",
ModelRevision = 1,
ParallelCorpora =
[
new()
{
Id = "parallel_corpus1",
SourceCorpora = new List<MonolingualCorpus>()
{
new()
{
Id = "file1",
Filename = "file1.zip",
Format = Shared.Contracts.FileFormat.Paratext,
TextId = "project1"
Id = "src_1",
Language = "en",
Files = [file1],
}
],
TargetFiles =
[
},
TargetCorpora = new List<MonolingualCorpus>()
{
new()
{
Id = "file2",
Filename = "file2.zip",
Format = Shared.Contracts.FileFormat.Paratext,
TextId = "project1"
Id = "trg_1",
Language = "es",
Files = [file2],
}
],
}
}
]
}
},
]
);

Expand All @@ -334,6 +366,26 @@ public TestEnvironment()
TextId = "MAT",
Refs = ["MAT 1:2"],
Translation = "Chapter 1, verse 2."
},
new()
{
Id = "pt3",
EngineRef = "parallel_engine1",
ModelRevision = 1,
CorpusRef = "parallel_corpus1",
TextId = "MAT",
Refs = ["MAT 1:1"],
Translation = "Chapter 1, verse 1."
},
new()
{
Id = "pt4",
EngineRef = "parallel_engine1",
ModelRevision = 1,
CorpusRef = "parallel_corpus1",
TextId = "MAT",
Refs = ["MAT 1:2"],
Translation = "Chapter 1, verse 2."
}
]
);
Expand All @@ -342,23 +394,37 @@ public TestEnvironment()
ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG"));
var zipSubstituteSource = Substitute.For<IZipContainer>();
var zipSubstituteTarget = Substitute.For<IZipContainer>();
zipSubstituteSource.OpenEntry("MATSRC.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm)));
zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes("")));
zipSubstituteSource
.OpenEntry("MATSRC.SFM")
.Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm)));
zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(x => new MemoryStream(Encoding.UTF8.GetBytes("")));
zipSubstituteSource.EntryExists(Arg.Any<string>()).Returns(false);
zipSubstituteTarget.EntryExists(Arg.Any<string>()).Returns(false);
zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true);
zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true);
TargetZipContainer = zipSubstituteTarget;
using var textUpdaterSource = new Shared.Services.ZipParatextProjectTextUpdater(
zipSubstituteSource,
CreateProjectSettings("SRC")
);
using var textUpdaterTarget = new Shared.Services.ZipParatextProjectTextUpdater(
zipSubstituteTarget,
CreateProjectSettings("TRG")
);
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(textUpdaterSource);
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(textUpdaterTarget);
TextUpdaters = new List<Shared.Services.ZipParatextProjectTextUpdater>();
// Creates a new text updater on every call, backed by the source ("SRC") or target ("TRG")
// zip substitute, and records it in TextUpdaters before returning it.
// Throws ArgumentException for any other value of `type`.
Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type)
{
    // Both supported cases build the updater the same way; only the backing container differs.
    IZipContainer container = type switch
    {
        "SRC" => zipSubstituteSource,
        "TRG" => zipSubstituteTarget,
        _
            => throw new ArgumentException(
                $"Unknown project type '{type}'; expected \"SRC\" or \"TRG\".",
                nameof(type)
            )
    };
    var updater = new Shared.Services.ZipParatextProjectTextUpdater(container, CreateProjectSettings(type));
    TextUpdaters.Add(updater);
    return updater;
}
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(x => GetTextUpdater("SRC"));
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(x => GetTextUpdater("TRG"));
Service = new PretranslationService(Pretranslations, Engines, ScriptureDataFileService);
}

Expand All @@ -367,6 +433,7 @@ public TestEnvironment()
public MemoryRepository<Engine> Engines { get; }
public IScriptureDataFileService ScriptureDataFileService { get; }
public IZipContainer TargetZipContainer { get; }
public IList<Shared.Services.ZipParatextProjectTextUpdater> TextUpdaters { get; }

public async Task<string> GetUsfmAsync(
PretranslationUsfmTextOrigin textOrigin,
Expand All @@ -381,12 +448,25 @@ PretranslationUsfmTemplate template
textOrigin: textOrigin,
template: template
);
return usfm.Replace("\r\n", "\n");
usfm = usfm.Replace("\r\n", "\n");
string parallel_usfm = await Service.GetUsfmAsync(
engineId: "parallel_engine1",
modelRevision: 1,
corpusId: "parallel_corpus1",
textId: "MAT",
textOrigin: textOrigin,
template: template
);
parallel_usfm = parallel_usfm.Replace("\r\n", "\n");
Assert.That(parallel_usfm, Is.EqualTo(usfm));
return usfm;
}

public void AddMatthewToTarget()
{
TargetZipContainer.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm)));
TargetZipContainer
.OpenEntry("MATTRG.SFM")
.Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm)));
}

private static ParatextProjectSettings CreateProjectSettings(string name)
Expand All @@ -406,5 +486,13 @@ private static ParatextProjectSettings CreateProjectSettings(string name)
languageCode: "en"
);
}

/// <summary>
/// Disposes each updater currently held in <see cref="TextUpdaters"/>, in list order.
/// </summary>
public void Dispose()
{
    for (int index = 0; index < TextUpdaters.Count; index++)
    {
        TextUpdaters[index].Dispose();
    }
}
}
}
Loading