Skip to content

Commit

Permalink
Make lang tag to NLLB lang code conversion more robust (#123)
Browse files Browse the repository at this point in the history
- properly handle languages with an implicit script
- normalize Chinese subtags
- convert macrolanguage subtag to corresponding standard language subtag
  • Loading branch information
ddaspit authored Oct 24, 2023
1 parent e06bee2 commit c9e43f6
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

public class NmtClearMLBuildJobFactory : IClearMLBuildJobFactory
{
/// <summary>
/// Maps a macrolanguage subtag code to the code of the standard language subtag that
/// replaces it; the mapped value is used as a key into StandardSubtags.RegisteredLanguages
/// during language tag conversion.
/// </summary>
private static readonly Dictionary<string, string> Macrolanguages =
    new()
    {
        ["ar"] = "arb",
        ["ms"] = "zsm",
        ["lv"] = "lvs",
        ["ne"] = "npi",
        ["sw"] = "swh",
    };

private readonly ISharedFileService _sharedFileService;
private readonly IRepository<TranslationEngine> _engines;
private readonly IOptionsMonitor<ClearMLOptions> _options;
Expand Down Expand Up @@ -58,13 +67,30 @@ private static string ConvertLanguageTag(string languageTag)
if (
!IetfLanguageTag.TryGetSubtags(
languageTag,
out LanguageSubtag languageSubtag,
out ScriptSubtag scriptSubtag,
out LanguageSubtag? languageSubtag,
out ScriptSubtag? scriptSubtag,
out _,
out _
)
)
{
return languageTag;
}

// Normalize Mandarin Chinese subtag to Chinese subtag
if (languageSubtag.Code == "cmn")
languageSubtag = StandardSubtags.RegisteredLanguages["zh"];
// Normalize macrolanguage subtag to the corresponding standard language subtag
else if (Macrolanguages.TryGetValue(languageSubtag.Code, out string? standardLanguageCode))
languageSubtag = StandardSubtags.RegisteredLanguages[standardLanguageCode];

if (scriptSubtag is null)
{
// if Chinese is specified without a script/region, then default to Simplified Chinese
if (languageSubtag.Code == "zh")
return "zho_Hans";
return languageTag;
}

// Convert to NLLB language codes
return $"{languageSubtag.Iso3Code}_{scriptSubtag.Code}";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
namespace SIL.Machine.AspNetCore.Services;

/// <summary>
/// Verifies the 'trg_lang' value that <see cref="NmtClearMLBuildJobFactory"/> writes into the
/// generated job script for a range of target language tags.
/// </summary>
[TestFixture]
public class NmtClearMLBuildJobFactoryTests
{
    // Two-letter language code without a script: expects the three-letter code plus a script.
    [Test]
    public async Task CreateJobScriptAsync_Iso639_1Code()
    {
        string jobScript = await CreateTrainScriptAsync("es");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'spa_Latn'"));
    }

    // Three-letter language code without a script: expects the same code plus a script.
    [Test]
    public async Task CreateJobScriptAsync_Iso639_3Code()
    {
        string jobScript = await CreateTrainScriptAsync("hne");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'hne_Deva'"));
    }

    // Language tag with an explicit script subtag: expects that script to be kept.
    [Test]
    public async Task CreateJobScriptAsync_ScriptCode()
    {
        string jobScript = await CreateTrainScriptAsync("ks-Arab");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'kas_Arab'"));
    }

    // Value that is not a valid language tag: expects it to appear in the script unchanged.
    [Test]
    public async Task CreateJobScriptAsync_InvalidLangTag()
    {
        string jobScript = await CreateTrainScriptAsync("srp_Cyrl");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'srp_Cyrl'"));
    }

    // Chinese with neither script nor region: expects the Hans script.
    [Test]
    public async Task CreateJobScriptAsync_ChineseNoScript()
    {
        string jobScript = await CreateTrainScriptAsync("zh");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zho_Hans'"));
    }

    // Chinese with an explicit Hant script: expects that script to be kept.
    [Test]
    public async Task CreateJobScriptAsync_ChineseScript()
    {
        string jobScript = await CreateTrainScriptAsync("zh-Hant");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zho_Hant'"));
    }

    // Chinese with the TW region and no script: expects the Hant script.
    [Test]
    public async Task CreateJobScriptAsync_ChineseRegion()
    {
        string jobScript = await CreateTrainScriptAsync("zh-TW");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zho_Hant'"));
    }

    // "cmn" without a script: expects the same result as plain "zh".
    [Test]
    public async Task CreateJobScriptAsync_MandarinChineseNoScript()
    {
        string jobScript = await CreateTrainScriptAsync("cmn");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zho_Hans'"));
    }

    // "cmn" with an explicit Hant script: expects the same result as "zh-Hant".
    [Test]
    public async Task CreateJobScriptAsync_MandarinChineseScript()
    {
        string jobScript = await CreateTrainScriptAsync("cmn-Hant");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zho_Hant'"));
    }

    // Macrolanguage code "ms": expects the "zsm" language code in the script.
    [Test]
    public async Task CreateJobScriptAsync_Macrolanguage()
    {
        string jobScript = await CreateTrainScriptAsync("ms");

        Assert.That(jobScript, Contains.Substring("'trg_lang': 'zsm_Latn'"));
    }

    /// <summary>
    /// Builds a fresh <see cref="TestEnvironment"/>, registers one engine whose target language is
    /// <paramref name="targetLanguage"/>, and returns the script produced by
    /// <c>CreateJobScriptAsync("engine1", "build1", "train")</c>.
    /// </summary>
    /// <param name="targetLanguage">The target language value stored on the engine.</param>
    /// <returns>The generated job script text.</returns>
    private static async Task<string> CreateTrainScriptAsync(string targetLanguage)
    {
        var environment = new TestEnvironment();
        environment.AddEngine(targetLanguage);
        return await environment.Factory.CreateJobScriptAsync("engine1", "build1", "train");
    }

    /// <summary>
    /// Wires a <see cref="NmtClearMLBuildJobFactory"/> to an in-memory engine repository, a
    /// <see cref="SharedFileService"/> created with a substitute logger factory, and a substitute
    /// options monitor that returns default <see cref="ClearMLOptions"/>.
    /// </summary>
    private class TestEnvironment
    {
        public TestEnvironment()
        {
            // Initialize SLDR in offline mode only when it has not been initialized already.
            if (!Sldr.IsInitialized)
            {
                Sldr.Initialize(offlineMode: true);
            }

            Engines = new MemoryRepository<TranslationEngine>();
            SharedFileService = new SharedFileService(Substitute.For<ILoggerFactory>());

            IOptionsMonitor<ClearMLOptions> optionsMonitor = Substitute.For<IOptionsMonitor<ClearMLOptions>>();
            optionsMonitor.CurrentValue.Returns(new ClearMLOptions());

            Factory = new NmtClearMLBuildJobFactory(SharedFileService, Engines, optionsMonitor);
        }

        public NmtClearMLBuildJobFactory Factory { get; }
        public MemoryRepository<TranslationEngine> Engines { get; }
        public ISharedFileService SharedFileService { get; }

        /// <summary>
        /// Adds an engine with id "engine1", source language "en", and the given target language
        /// to the in-memory repository.
        /// </summary>
        /// <param name="targetLanguage">The value assigned to the engine's TargetLanguage.</param>
        public void AddEngine(string targetLanguage)
        {
            var engine = new TranslationEngine
            {
                Id = "engine1",
                EngineId = "engine1",
                SourceLanguage = "en",
                TargetLanguage = targetLanguage,
                BuildRevision = 1
            };

            Engines.Add(engine);
        }
    }
}

0 comments on commit c9e43f6

Please sign in to comment.