Skip to content

Commit

Permalink
Update Echo engine to use toolkit
Browse files Browse the repository at this point in the history
Fix bug with pretranslating all; begin porting tests to toolkit
Another small logic fix; update tests to reflect not generating pretranslations for segments with target text
Fix issue with mapping non-parallel-corpora to parallel corpora
Move to service; address scripture alignment issue
  • Loading branch information
Enkidu93 authored and johnml1135 committed Oct 21, 2024
1 parent bdf43fa commit 331657b
Show file tree
Hide file tree
Showing 35 changed files with 865 additions and 606 deletions.
15 changes: 15 additions & 0 deletions Serval.sln
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{32B63C4B-AECD-4499-ADFB-69EF581B4F4C}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{76123A14-29A5-480D-942E-FE00D6474D50}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -180,6 +188,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
{C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -215,6 +227,9 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
{76123A14-29A5-480D-942E-FE00D6474D50} = {32B63C4B-AECD-4499-ADFB-69EF581B4F4C}
{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {76123A14-29A5-480D-942E-FE00D6474D50}
{C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
Expand Down
249 changes: 102 additions & 147 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
namespace EchoTranslationEngine;

public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase
public class TranslationEngineServiceV1(
BackgroundTaskQueue taskQueue,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
) : TranslationEngineApi.TranslationEngineApiBase
{
private static readonly Empty Empty = new();
private readonly BackgroundTaskQueue _taskQueue = taskQueue;

private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
parallelCorpusPreprocessingService;

public override Task<CreateResponse> Create(CreateRequest request, ServerCallContext context)
{
if (request.SourceLanguage != request.TargetLanguage)
Expand Down Expand Up @@ -75,159 +81,34 @@ await client.BuildStartedAsync(

try
{
List<InsertPretranslationsRequest> pretranslationsRequests = [];
_parallelCorpusPreprocessingService.Preprocess(
request.Corpora.Select(Map).ToList(),
row => { },
(row, corpus) =>
{
pretranslationsRequests.Add(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = { row.Refs.Select(r => r.ToString()) },
Translation = row.SourceSegment
}
);
},
false
);
using (
AsyncClientStreamingCall<InsertPretranslationsRequest, Empty> call =
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
foreach (ParallelCorpus corpus in request.Corpora)
foreach (InsertPretranslationsRequest request in pretranslationsRequests)
{
var sourceFiles = corpus
.SourceCorpora.SelectMany(sc =>
sc.Files.Where(f =>
(
sc.PretranslateAll
|| sc.PretranslateTextIds is null
|| sc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);
var targetFiles = corpus
.TargetCorpora.SelectMany(tc =>
tc.Files.Where(f =>
(
tc.PretranslateAll
|| tc.PretranslateTextIds is null
|| tc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);

foreach (KeyValuePair<string, string> sourceFile in sourceFiles)
{
string[] sourceLines = await File.ReadAllLinesAsync(
sourceFile.Value,
cancellationToken
);

if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath))
{
string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken);
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (
(string sourceLine, string targetLine) in sourceLines
.Select(l => l.Trim())
.Zip(targetLines.Select(l => l.Trim()))
)
{
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
var sourceLinesDict = sourceLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Split('\t')[1].Trim()
);
var targetLinesDict = targetLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty
);
foreach (KeyValuePair<string, string> targetLineKVPair in targetLinesDict)
{
string? sourceLine = null;
sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine);
sourceLine ??= string.Empty;
string? targetLine = targetLineKVPair.Value;
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" },
Translation = sourceLine
},
cancellationToken
);
}
}
}
}
else
{
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" },
Translation = sourceLine.Contains('\t')
? sourceLine.Split('\t')[1].Trim()
: string.Empty
},
cancellationToken
);
}
}
}
}
}
await call.RequestStream.WriteAsync(request, cancellationToken);
}

await call.RequestStream.CompleteAsync();
await call;
}
Expand Down Expand Up @@ -325,4 +206,78 @@ ServerCallContext context
new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, }
);
}

/// <summary>
/// Converts a request-level parallel corpus into the ServiceToolkit parallel corpus model.
/// </summary>
/// <param name="source">The parallel corpus received in the request.</param>
/// <returns>
/// A ServiceToolkit parallel corpus with the same id and with every source and target
/// monolingual corpus converted through the monolingual-corpus overload of <c>Map</c>.
/// </returns>
private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source)
{
    // Convert both sides up front so the initializer below only wires the results together.
    var mappedSources = source.SourceCorpora.Select(Map).ToList();
    var mappedTargets = source.TargetCorpora.Select(Map).ToList();

    return new SIL.ServiceToolkit.Models.ParallelCorpus
    {
        Id = source.Id,
        SourceCorpora = mappedSources,
        TargetCorpora = mappedTargets
    };
}

/// <summary>
/// Converts a request-level monolingual corpus into the ServiceToolkit monolingual corpus model.
/// </summary>
/// <remarks>
/// Training and pretranslation are handled independently. For each, <c>GetFilterChoice</c> picks
/// the chapter dictionary, the text id set, or neither; whatever was not picked is set to
/// <c>null</c> on the result.
/// </remarks>
/// <param name="source">The monolingual corpus received in the request.</param>
/// <returns>The converted corpus, including its converted files.</returns>
private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source)
{
    // Training selection: materialize both candidate filters, then decide which one applies.
    Dictionary<string, HashSet<int>> chaptersForTraining = source.TrainOnChapters.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Chapters.ToHashSet()
    );
    HashSet<string> textIdsForTraining = source.TrainOnTextIds.ToHashSet();
    FilterChoice trainingChoice = GetFilterChoice(chaptersForTraining, textIdsForTraining, source.TrainOnAll);

    // Pretranslation selection: same procedure on the pretranslate fields.
    Dictionary<string, HashSet<int>> chaptersForPretranslation = source.PretranslateChapters.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Chapters.ToHashSet()
    );
    HashSet<string> textIdsForPretranslation = source.PretranslateTextIds.ToHashSet();
    FilterChoice pretranslationChoice = GetFilterChoice(
        chaptersForPretranslation,
        textIdsForPretranslation,
        source.PretranslateAll
    );

    bool trainByChapters = trainingChoice == FilterChoice.Chapters;
    bool trainByTextIds = trainingChoice == FilterChoice.TextIds;
    bool pretranslateByChapters = pretranslationChoice == FilterChoice.Chapters;
    bool pretranslateByTextIds = pretranslationChoice == FilterChoice.TextIds;

    return new SIL.ServiceToolkit.Models.MonolingualCorpus
    {
        Id = source.Id,
        Language = source.Language,
        Files = source.Files.Select(Map).ToList(),
        TrainOnChapters = trainByChapters ? chaptersForTraining : null,
        TrainOnTextIds = trainByTextIds ? textIdsForTraining : null,
        PretranslateChapters = pretranslateByChapters ? chaptersForPretranslation : null,
        PretranslateTextIds = pretranslateByTextIds ? textIdsForPretranslation : null
    };
}

/// <summary>
/// Converts a request-level corpus file into the ServiceToolkit corpus file model.
/// </summary>
/// <param name="source">The corpus file received in the request.</param>
/// <returns>A ServiceToolkit corpus file with the same location, format and text id.</returns>
// NOTE(review): the format is converted with a direct enum cast, which presumably relies on both
// FileFormat enums sharing the same underlying values - confirm against the two declarations.
private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) =>
    new()
    {
        TextId = source.TextId,
        Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format,
        Location = source.Location
    };

/// <summary>
/// The kind of corpus filter selected by <c>GetFilterChoice</c>.
/// </summary>
private enum FilterChoice
{
/// <summary>Use the chapter dictionary as the filter.</summary>
Chapters,
/// <summary>Use the set of text ids as the filter.</summary>
TextIds,
/// <summary>Use neither the chapter dictionary nor the text ids.</summary>
None
}

/// <summary>
/// Decides which single filter applies: chapters, text ids, or none.
/// </summary>
/// <param name="chapters">Candidate chapter filter.</param>
/// <param name="textIds">Candidate text id filter.</param>
/// <param name="noFilter">When <c>true</c>, no filter is applied regardless of the other arguments.</param>
/// <returns>
/// <see cref="FilterChoice.None"/> when <paramref name="noFilter"/> is set or both candidates are
/// <c>null</c>; <see cref="FilterChoice.Chapters"/> when <paramref name="chapters"/> has at least one
/// entry; otherwise <see cref="FilterChoice.TextIds"/>.
/// </returns>
private static FilterChoice GetFilterChoice(
    IReadOnlyDictionary<string, HashSet<int>> chapters,
    HashSet<string> textIds,
    bool noFilter
)
{
    // At most one of the two filters is ever used.
    bool nothingSupplied = chapters is null && textIds is null;
    if (noFilter || nothingSupplied)
        return FilterChoice.None;

    // An empty text id set is still a meaningful filter, so text ids win whenever there are no
    // chapters to filter by (including the case where both collections are empty).
    bool hasChapters = (chapters?.Count ?? 0) > 0;
    return hasChapters ? FilterChoice.Chapters : FilterChoice.TextIds;
}
}
1 change: 1 addition & 0 deletions src/Echo/src/EchoTranslationEngine/Usings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
global using Grpc.Core;
global using Microsoft.Extensions.Diagnostics.HealthChecks;
global using Serval.Translation.V1;
global using SIL.ServiceToolkit.Utils;
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, I
return builder;
}

/// <summary>
/// Calls <c>AddParallelCorpusPreprocessor</c> on the builder's service collection.
/// </summary>
/// <param name="builder">The machine builder whose <c>Services</c> collection receives the registration.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
// NOTE(review): presumably this is what makes IParallelCorpusPreprocessingService resolvable
// from the container - confirm against the AddParallelCorpusPreprocessor implementation.
public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder)
{
builder.Services.AddParallelCorpusPreprocessor();
return builder;
}

public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
{
if (builder.Configuration is null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
services.AddTransient<IFileSystem, FileSystem>();

services.AddScoped<IDistributedReaderWriterLockFactory, DistributedReaderWriterLockFactory>();
services.AddSingleton<ICorpusService, CorpusService>();
services.AddStartupTask(
(sp, cancellationToken) =>
sp.GetRequiredService<IDistributedReaderWriterLockFactory>().InitAsync(cancellationToken)
Expand All @@ -42,6 +41,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
configuration.GetSection(DistributedReaderWriterLockOptions.Key)
);
builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key));
builder.AddServiceToolkitServices();
builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key));
}
return builder;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob(
ILogger<NmtPreprocessBuildJob> logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
ICorpusService corpusService,
ILanguageTagService languageTagService
ILanguageTagService languageTagService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
)
: PreprocessBuildJob(
platformService,
Expand All @@ -17,7 +17,7 @@ ILanguageTagService languageTagService
logger,
buildJobService,
sharedFileService,
corpusService
parallelCorpusPreprocessingService
)
{
private readonly ILanguageTagService _languageTagService = languageTagService;
Expand Down
Loading

0 comments on commit 331657b

Please sign in to comment.