From c9e43f6f57a1aaa8255464411eed33f6db1cc460 Mon Sep 17 00:00:00 2001
From: Damien Daspit
Date: Tue, 24 Oct 2023 18:46:39 -0500
Subject: [PATCH] Make lang tag to NLLB lang code conversion more robust (#123)

- properly handle languages with an implicit script
- normalize Chinese subtags
- convert macrolanguage subtag to corresponding standard language subtag
---
 .../Services/NmtClearMLBuildJobFactory.cs     |  30 +++-
 .../NmtClearMLBuildJobFactoryTests.cs         | 129 ++++++++++++++++++
 2 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 src/Machine/test/SIL.Machine.AspNetCore.Tests/Services/NmtClearMLBuildJobFactoryTests.cs

diff --git a/src/Machine/src/SIL.Machine.AspNetCore/Services/NmtClearMLBuildJobFactory.cs b/src/Machine/src/SIL.Machine.AspNetCore/Services/NmtClearMLBuildJobFactory.cs
index a4801484..1ac39fb8 100644
--- a/src/Machine/src/SIL.Machine.AspNetCore/Services/NmtClearMLBuildJobFactory.cs
+++ b/src/Machine/src/SIL.Machine.AspNetCore/Services/NmtClearMLBuildJobFactory.cs
@@ -2,6 +2,15 @@
 
 public class NmtClearMLBuildJobFactory : IClearMLBuildJobFactory
 {
+    private static readonly Dictionary<string, string> Macrolanguages = new Dictionary<string, string>
+    {
+        { "ar", "arb" },
+        { "ms", "zsm" },
+        { "lv", "lvs" },
+        { "ne", "npi" },
+        { "sw", "swh" }
+    };
+
     private readonly ISharedFileService _sharedFileService;
     private readonly IRepository<TranslationEngine> _engines;
     private readonly IOptionsMonitor<ClearMLOptions> _options;
@@ -58,13 +67,30 @@ private static string ConvertLanguageTag(string languageTag)
         if (
             !IetfLanguageTag.TryGetSubtags(
                 languageTag,
-                out LanguageSubtag languageSubtag,
-                out ScriptSubtag scriptSubtag,
+                out LanguageSubtag? languageSubtag,
+                out ScriptSubtag? scriptSubtag,
                 out _,
                 out _
             )
         )
+        {
             return languageTag;
+        }
+
+        // Normalize Mandarin Chinese subtag to Chinese subtag
+        if (languageSubtag.Code == "cmn")
+            languageSubtag = StandardSubtags.RegisteredLanguages["zh"];
+        // Normalize macrolanguage subtag to the corresponding standard language subtag
+        else if (Macrolanguages.TryGetValue(languageSubtag.Code, out string? standardLanguageCode))
+            languageSubtag = StandardSubtags.RegisteredLanguages[standardLanguageCode];
+
+        if (scriptSubtag is null)
+        {
+            // if Chinese is specified without a script/region, then default to Simplified Chinese
+            if (languageSubtag.Code == "zh")
+                return "zho_Hans";
+            return languageTag;
+        }
 
         // Convert to NLLB language codes
         return $"{languageSubtag.Iso3Code}_{scriptSubtag.Code}";
diff --git a/src/Machine/test/SIL.Machine.AspNetCore.Tests/Services/NmtClearMLBuildJobFactoryTests.cs b/src/Machine/test/SIL.Machine.AspNetCore.Tests/Services/NmtClearMLBuildJobFactoryTests.cs
new file mode 100644
index 00000000..a89b82b1
--- /dev/null
+++ b/src/Machine/test/SIL.Machine.AspNetCore.Tests/Services/NmtClearMLBuildJobFactoryTests.cs
@@ -0,0 +1,129 @@
+namespace SIL.Machine.AspNetCore.Services;
+
+[TestFixture]
+public class NmtClearMLBuildJobFactoryTests
+{
+    [Test]
+    public async Task CreateJobScriptAsync_Iso639_1Code()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("es");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'spa_Latn'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_Iso639_3Code()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("hne");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'hne_Deva'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_ScriptCode()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("ks-Arab");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'kas_Arab'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_InvalidLangTag()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("srp_Cyrl");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'srp_Cyrl'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_ChineseNoScript()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("zh");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zho_Hans'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_ChineseScript()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("zh-Hant");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zho_Hant'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_ChineseRegion()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("zh-TW");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zho_Hant'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_MandarinChineseNoScript()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("cmn");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zho_Hans'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_MandarinChineseScript()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("cmn-Hant");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zho_Hant'"));
+    }
+
+    [Test]
+    public async Task CreateJobScriptAsync_Macrolanguage()
+    {
+        var env = new TestEnvironment();
+        env.AddEngine("ms");
+        string script = await env.Factory.CreateJobScriptAsync("engine1", "build1", "train");
+        Assert.That(script, Contains.Substring("'trg_lang': 'zsm_Latn'"));
+    }
+
+    private class TestEnvironment
+    {
+        public TestEnvironment()
+        {
+            if (!Sldr.IsInitialized)
+                Sldr.Initialize(offlineMode: true);
+
+            Engines = new MemoryRepository<TranslationEngine>();
+
+            SharedFileService = new SharedFileService(Substitute.For());
+            var clearMLOptions = Substitute.For<IOptionsMonitor<ClearMLOptions>>();
+            clearMLOptions.CurrentValue.Returns(new ClearMLOptions());
+            Factory = new NmtClearMLBuildJobFactory(SharedFileService, Engines, clearMLOptions);
+        }
+
+        public NmtClearMLBuildJobFactory Factory { get; }
+        public MemoryRepository<TranslationEngine> Engines { get; }
+        public ISharedFileService SharedFileService { get; }
+
+        public void AddEngine(string targetLanguage)
+        {
+            Engines.Add(
+                new TranslationEngine
+                {
+                    Id = "engine1",
+                    EngineId = "engine1",
+                    SourceLanguage = "en",
+                    TargetLanguage = targetLanguage,
+                    BuildRevision = 1
+                }
+            );
+        }
+    }
+}