Skip to content

Commit

Permalink
Pretranslate even if there isn't a target corpus.
Browse files Browse the repository at this point in the history
Put "TrainOnAll" and "PretranslateAll" back in
Don't do https redirection
  • Loading branch information
johnml1135 committed Oct 11, 2024
1 parent 261dde1 commit 43fd395
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
build:
name: Build
runs-on: ubuntu-latest
timeout-minutes: 45
timeout-minutes: 60

env:
SERVAL_CLIENT_ID: ${{ secrets.SERVAL_CLIENT_ID }}
Expand Down
3 changes: 0 additions & 3 deletions src/Echo/src/EchoTranslationEngine/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@

WebApplication app = builder.Build();

// Configure the HTTP request pipeline.
app.UseHttpsRedirection();

app.MapGrpcService<TranslationEngineServiceV1>();
app.MapGrpcService<HealthServiceV1>();

Expand Down
2 changes: 0 additions & 2 deletions src/Machine/src/Serval.Machine.EngineServer/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@

var app = builder.Build();

app.UseHttpsRedirection();

app.MapServalTranslationEngineService();
app.MapHangfireDashboard();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,21 +238,33 @@ row.Ref is not ScriptureRef sr
}
}
}

foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus))
// Emits one pretranslation entry as a JSON object (corpusId, textId, refs, translation)
// and bumps the running pretranslate counter captured from the enclosing scope.
void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList<object> refs, string translation)
{
    writer.WriteStartObject();

    writer.WriteString("corpusId", corpus.Id);
    writer.WriteString("textId", textId);

    // Each reference is serialized through its ToString() representation.
    writer.WriteStartArray("refs");
    for (int refIndex = 0; refIndex < refs.Count; refIndex++)
    {
        writer.WriteStringValue(refs[refIndex].ToString());
    }
    writer.WriteEndArray();

    writer.WriteString("translation", translation);

    writer.WriteEndObject();
    pretranslateCount++;
}
if (targetCorpora.Length == 0)
{
foreach (Row row in GetPretranslateCorpusNoTarget(sourcePretranslateCorpora))
{
if (row.SourceSegment.Length > 0)
WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment);
}
}
else
{
if (row.SourceSegment.Length > 0)
foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus))
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
pretranslateWriter.WriteString("textId", row.TextId);
pretranslateWriter.WriteStartArray("refs");
foreach (object rowRef in row.Refs)
pretranslateWriter.WriteStringValue(rowRef.ToString());
pretranslateWriter.WriteEndArray();
pretranslateWriter.WriteString("translation", row.SourceSegment);
pretranslateWriter.WriteEndObject();
pretranslateCount++;
if (row.SourceSegment.Length > 0)
WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment);
}
}
}
Expand Down Expand Up @@ -454,6 +466,47 @@ private static IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora
yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1);
}

/// <summary>
/// Produces pretranslation rows from the source corpora alone, for the case where there is
/// no target corpus to align against.
/// </summary>
/// <remarks>
/// Source rows flagged as being inside a range without being the range start
/// (<c>IsInRange &amp;&amp; !IsRangeStart</c>) are folded into the row currently being built:
/// their ref is appended and their non-empty segment text is joined with a single space.
/// Any other row flushes the pending row (if there is one) and starts a new one.
/// The target segment of every yielded row is the empty string.
/// </remarks>
/// <param name="srcCorpora">Source corpora, enumerated in order and concatenated.</param>
/// <returns>A lazily evaluated sequence of merged rows.</returns>
private static IEnumerable<Row> GetPretranslateCorpusNoTarget(ITextCorpus[] srcCorpora)
{
    int rowCount = 0;
    StringBuilder srcSegBuffer = new();
    List<object> refs = [];
    string textId = "";
    foreach (TextRow row in srcCorpora.SelectMany(sc => sc))
    {
        if (!row.IsRangeStart && row.IsInRange)
        {
            // Continuation of the current range: extend the pending row.
            refs.Add(row.Ref);
            if (row.Segment.Count > 0)
            {
                if (srcSegBuffer.Length > 0)
                    srcSegBuffer.Append(' ');
                srcSegBuffer.Append(string.Join(" ", row.Segment));
            }
            rowCount++;
        }
        else
        {
            if (rowCount > 0)
            {
                // Hand out a snapshot of the refs: the working list is cleared and reused
                // right after the consumer resumes the iterator, so yielding the list itself
                // would corrupt rows that the consumer has buffered.
                yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), "", 1);
                srcSegBuffer.Clear();
                refs.Clear();
                rowCount = 0;
            }

            // Start a new pending row from this source row.
            textId = row.TextId;
            refs.Add(row.Ref);
            srcSegBuffer.Append(string.Join(" ", row.Segment));
            rowCount++;
        }
    }

    // Flush whatever is still pending after the last source row.
    if (rowCount > 0)
        yield return new(textId, refs.ToArray(), srcSegBuffer.ToString(), "", 1);
}

private record Row(
string TextId,
IReadOnlyList<object> Refs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou
var pretranslateTextIds = source.PretranslateTextIds.ToHashSet();
FilterChoice pretranslateFilter = GetFilterChoice(pretranslateChapters, pretranslateTextIds);

return new Models.MonolingualCorpus
var corpus = new Models.MonolingualCorpus
{
Id = source.Id,
Language = source.Language,
Expand All @@ -305,6 +305,17 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou
PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null,
PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null
};
if (source.PretranslateAll)
{
corpus.PretranslateChapters = null;
corpus.PretranslateTextIds = null;
}
if (source.TrainOnAll)
{
corpus.TrainOnChapters = null;
corpus.TrainOnTextIds = null;
}
return corpus;
}

private static Models.CorpusFile Map(Translation.V1.CorpusFile source)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ message ParallelCorpus {
message MonolingualCorpus {
string id = 1;
string language = 2;
bool train_on_all = 3;
bool pretranslate_all = 4;
map<string, ScriptureChapters> train_on_chapters = 5;
map<string, ScriptureChapters> pretranslate_chapters = 6;
repeated string train_on_text_ids = 7;
Expand Down
14 changes: 12 additions & 2 deletions src/Serval/src/Serval.Translation/Services/EngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,12 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre
V1.MonolingualCorpus targetCorpus =
new() { Language = source.TargetLanguage, Files = { source.TargetFiles.Select(Map) } };

if (trainingCorpus != null)
if (trainingCorpus == null)
{
sourceCorpus.TrainOnAll = true;
targetCorpus.TrainOnAll = true;
}
else
{
if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null)
{
Expand Down Expand Up @@ -636,7 +641,12 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre
targetCorpus.TrainOnChapters.Add(chapters);
}
}
if (pretranslateCorpus != null)
if (pretranslateCorpus == null)
{
sourceCorpus.PretranslateAll = true;
targetCorpus.PretranslateAll = true;
}
else
{
if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null)
{
Expand Down
73 changes: 73 additions & 0 deletions src/Serval/test/Serval.E2ETests/ServalClientHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ namespace Serval.E2ETests;
public class ServalClientHelper : IAsyncDisposable
{
public DataFilesClient DataFilesClient { get; }
public CorporaClient CorporaClient { get; }
public TranslationEnginesClient TranslationEnginesClient { get; }
public TranslationEngineTypesClient TranslationEngineTypesClient { get; }

Expand Down Expand Up @@ -32,6 +33,7 @@ public ServalClientHelper(string audience, string prefix = "SCE_", bool ignoreSS
_httpClient.BaseAddress = new Uri(hostUrl);
_httpClient.Timeout = TimeSpan.FromSeconds(60);
DataFilesClient = new DataFilesClient(_httpClient);
CorporaClient = new CorporaClient(_httpClient);
TranslationEnginesClient = new TranslationEnginesClient(_httpClient);
TranslationEngineTypesClient = new TranslationEngineTypesClient(_httpClient);
_prefix = prefix;
Expand Down Expand Up @@ -229,6 +231,77 @@ bool pretranslate
return response.Id;
}

/// <summary>
/// Uploads the given text files, builds a source corpus and a target corpus from them,
/// attaches both to the engine as a parallel corpus and returns the parallel corpus id.
/// </summary>
/// <param name="engineId">Id of the translation engine that receives the parallel corpus.</param>
/// <param name="filesToAdd">Files to upload; each entry is also used as the text id of its file.</param>
/// <param name="sourceLanguage">Language of the source corpus.</param>
/// <param name="targetLanguage">Language of the target corpus.</param>
/// <param name="pretranslate">
/// When true, no target files are uploaded (the target corpus is created empty) and a
/// pretranslate entry covering all of <paramref name="filesToAdd"/> is added to the build config.
/// </param>
/// <returns>The id of the newly added parallel corpus.</returns>
public async Task<string> AddParallelTextCorpusToEngineAsync(
    string engineId,
    string[] filesToAdd,
    string sourceLanguage,
    string targetLanguage,
    bool pretranslate
)
{
    List<DataFile> uploadedSourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage);

    // Target side: files are only uploaded when we are not pretranslating.
    List<CorpusFileConfig> targetFileConfigs = new();
    if (!pretranslate)
    {
        List<DataFile> uploadedTargetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage);
        for (int i = 0; i < uploadedTargetFiles.Count; i++)
        {
            targetFileConfigs.Add(
                new CorpusFileConfig { FileId = uploadedTargetFiles[i].Id, TextId = filesToAdd[i] }
            );
        }
    }

    var targetCorpus = await CorporaClient.CreateAsync(
        new CorpusConfig
        {
            Name = "None",
            Language = targetLanguage,
            Files = targetFileConfigs
        }
    );

    // Source side: the source files are left out only when the languages match and we are
    // not pretranslating (echo for suggestions). When pretranslating, the source must be
    // supplied separately; with different languages we are not echoing at all.
    bool includeSourceFiles = pretranslate || sourceLanguage != targetLanguage;
    List<CorpusFileConfig> sourceFileConfigs = includeSourceFiles
        ? uploadedSourceFiles
            .Select((file, i) => new CorpusFileConfig { FileId = file.Id, TextId = filesToAdd[i] })
            .ToList()
        : new List<CorpusFileConfig>();

    var sourceCorpus = await CorporaClient.CreateAsync(
        new CorpusConfig
        {
            Name = "None",
            Language = sourceLanguage,
            Files = sourceFileConfigs
        }
    );

    TranslationParallelCorpusConfig parallelConfig =
        new() { SourceCorpusIds = { sourceCorpus.Id }, TargetCorpusIds = { targetCorpus.Id } };
    var addedCorpus = await TranslationEnginesClient.AddParallelCorpusAsync(engineId, parallelConfig);

    if (!pretranslate)
        return addedCorpus.Id;

    // Register every uploaded text for pretranslation against the new parallel corpus.
    TranslationBuildConfig.Pretranslate!.Add(
        new PretranslateCorpusConfig { CorpusId = addedCorpus.Id, TextIds = filesToAdd.ToList() }
    );
    return addedCorpus.Id;
}

public async Task<List<DataFile>> UploadFilesAsync(
IEnumerable<string> filesToAdd,
FileFormat fileFormat,
Expand Down

0 comments on commit 43fd395

Please sign in to comment.