Skip to content

Commit

Permalink
Very broken - a concept
Browse files Browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Sep 20, 2024
1 parent f162914 commit 06b1173
Show file tree
Hide file tree
Showing 59 changed files with 420 additions and 350 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace Serval.Machine.Shared.Models;

public record Corpus
public record TranslationCorpus
{
public required string Id { get; init; }
public required string SourceLanguage { get; init; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<TranslationCorpus> corpora,
CancellationToken cancellationToken = default
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public async Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<TranslationCorpus> corpora,
CancellationToken cancellationToken = default
)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public Job CreateJob(string engineId, string buildId, BuildStage stage, object?
return stage switch
{
BuildStage.Preprocess
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<Corpus>>(
=> CreateJob<NmtPreprocessBuildJob, IReadOnlyList<TranslationCorpus>>(
engineId,
buildId,
"nmt",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace Serval.Machine.Shared.Services;

public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
public class PreprocessBuildJob : HangfireBuildJob<IReadOnlyList<TranslationCorpus>>
{
private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true };

Expand Down Expand Up @@ -43,7 +43,7 @@ internal int Seed
protected override async Task DoWorkAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<TranslationCorpus> data,
string? buildOptions,
CancellationToken cancellationToken
)
Expand Down Expand Up @@ -99,7 +99,7 @@ CancellationToken cancellationToken

private async Task<(int TrainCount, int PretranslateCount)> WriteDataFilesAsync(
string buildId,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<TranslationCorpus> corpora,
string? buildOptions,
CancellationToken cancellationToken
)
Expand All @@ -121,7 +121,7 @@ CancellationToken cancellationToken
int trainCount = 0;
int pretranslateCount = 0;
pretranslateWriter.WriteStartArray();
foreach (Corpus corpus in corpora)
foreach (TranslationCorpus corpus in corpora)
{
ITextCorpus[] sourceTextCorpora = _corpusService.CreateTextCorpora(corpus.SourceFiles).ToArray();
ITextCorpus targetTextCorpus =
Expand Down Expand Up @@ -204,7 +204,7 @@ CancellationToken cancellationToken
protected override async Task CleanupAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<TranslationCorpus> data,
JobCompletionStatus completionStatus
)
{
Expand All @@ -221,12 +221,12 @@ JobCompletionStatus completionStatus
}
}

private static bool IsInTrain(Row row, Corpus corpus)
private static bool IsInTrain(Row row, TranslationCorpus corpus)
{
return IsIncluded(row, corpus.TrainOnTextIds, corpus.TrainOnChapters);
}

private static bool IsInPretranslate(Row row, Corpus corpus)
private static bool IsInPretranslate(Row row, TranslationCorpus corpus)
{
return IsIncluded(row, corpus.PretranslateTextIds, corpus.PretranslateChapters);
}
Expand Down Expand Up @@ -255,7 +255,7 @@ private static bool IsInChapters(IReadOnlyDictionary<string, HashSet<int>> bookC
}

private static IEnumerable<Row?[]> AlignTrainCorpus(
Corpus corpus,
TranslationCorpus corpus,
IReadOnlyList<ITextCorpus> srcCorpora,
ITextCorpus trgCorpus
)
Expand Down Expand Up @@ -379,7 +379,11 @@ ITextCorpus trgCorpus
}
}

private static IEnumerable<Row> AlignPretranslateCorpus(Corpus corpus, ITextCorpus srcCorpus, ITextCorpus trgCorpus)
private static IEnumerable<Row> AlignPretranslateCorpus(
TranslationCorpus corpus,
ITextCorpus srcCorpus,
ITextCorpus trgCorpus
)
{
IEnumerable<string>? textIds = corpus.PretranslateChapters is not null
? corpus.PretranslateChapters.Keys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ await engineService.TrainSegmentPairAsync(
public override async Task<Empty> StartBuild(StartBuildRequest request, ServerCallContext context)
{
ITranslationEngineService engineService = GetEngineService(request.EngineType);
Models.Corpus[] corpora = request.Corpora.Select(Map).ToArray();
Models.TranslationCorpus[] corpora = request.Corpora.Select(Map).ToArray();
try
{
await engineService.StartBuildAsync(
Expand Down Expand Up @@ -269,7 +269,7 @@ private static Translation.V1.Phrase Map(SIL.Machine.Translation.Phrase source)
};
}

private static Models.Corpus Map(Translation.V1.Corpus source)
private static Models.TranslationCorpus Map(Translation.V1.Corpus source)
{
var pretranslateChapters = source.PretranslateChapters.ToDictionary(
kvp => kvp.Key,
Expand All @@ -283,7 +283,7 @@ private static Models.Corpus Map(Translation.V1.Corpus source)
);
FilterChoice trainingFilter = GetFilterChoice(source.TrainOnAll, trainOnChapters);

return new Models.Corpus
return new Models.TranslationCorpus
{
Id = source.Id,
SourceLanguage = source.SourceLanguage,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ public async Task StartBuildAsync(
string engineId,
string buildId,
string? buildOptions,
IReadOnlyList<Corpus> corpora,
IReadOnlyList<TranslationCorpus> corpora,
CancellationToken cancellationToken = default
)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public Job CreateJob(string engineId, string buildId, BuildStage stage, object?
return stage switch
{
BuildStage.Preprocess
=> CreateJob<SmtTransferPreprocessBuildJob, IReadOnlyList<Corpus>>(
=> CreateJob<SmtTransferPreprocessBuildJob, IReadOnlyList<TranslationCorpus>>(
engineId,
buildId,
"smt_transfer",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ IRepository<TrainSegmentPair> trainSegmentPairs
protected override async Task InitializeAsync(
string engineId,
string buildId,
IReadOnlyList<Corpus> data,
IReadOnlyList<TranslationCorpus> data,
CancellationToken cancellationToken
)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public async Task StartBuildAsync()
using var env = new TestEnvironment();
TranslationEngine engine = env.Engines.Get("engine1");
Assert.That(engine.BuildRevision, Is.EqualTo(1));
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<Corpus>());
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<TranslationCorpus>());
await env.WaitForBuildToFinishAsync();
engine = env.Engines.Get("engine1");
Assert.Multiple(() =>
Expand All @@ -28,7 +28,7 @@ public async Task CancelBuildAsync_Building()

TranslationEngine engine = env.Engines.Get("engine1");
Assert.That(engine.BuildRevision, Is.EqualTo(1));
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<Corpus>());
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<TranslationCorpus>());
await env.WaitForBuildToStartAsync();
engine = env.Engines.Get("engine1");
Assert.That(engine.CurrentBuild, Is.Not.Null);
Expand All @@ -55,7 +55,7 @@ public async Task DeleteAsync_WhileBuilding()

TranslationEngine engine = env.Engines.Get("engine1");
Assert.That(engine.BuildRevision, Is.EqualTo(1));
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<Corpus>());
await env.Service.StartBuildAsync("engine1", "build1", "{}", Array.Empty<TranslationCorpus>());
await env.WaitForBuildToStartAsync();
engine = env.Engines.Get("engine1");
Assert.That(engine.CurrentBuild, Is.Not.Null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public class PreprocessBuildJobTests
public async Task RunAsync_FilterOutEverything()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { };

await env.RunBuildJobAsync(corpus1);

Expand All @@ -25,7 +25,7 @@ public async Task RunAsync_FilterOutEverything()
public async Task RunAsync_TrainOnAll()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = null };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = null };

await env.RunBuildJobAsync(corpus1);

Expand All @@ -43,7 +43,7 @@ public async Task RunAsync_TrainOnAll()
public async Task RunAsync_TrainOnTextIds()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = ["textId1"] };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { TrainOnTextIds = ["textId1"] };

await env.RunBuildJobAsync(corpus1);

Expand All @@ -61,7 +61,11 @@ public async Task RunAsync_TrainOnTextIds()
public async Task RunAsync_TrainAndPretranslateAll()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = null, TrainOnTextIds = null };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with
{
PretranslateTextIds = null,
TrainOnTextIds = null
};

await env.RunBuildJobAsync(corpus1);

Expand All @@ -72,7 +76,7 @@ public async Task RunAsync_TrainAndPretranslateAll()
public async Task RunAsync_PretranslateAll()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = null };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = null };

await env.RunBuildJobAsync(corpus1);

Expand All @@ -83,7 +87,11 @@ public async Task RunAsync_PretranslateAll()
public async Task RunAsync_PretranslateTextIds()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { PretranslateTextIds = ["textId1"], TrainOnTextIds = null };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with
{
PretranslateTextIds = ["textId1"],
TrainOnTextIds = null
};

await env.RunBuildJobAsync(corpus1);

Expand All @@ -94,7 +102,7 @@ public async Task RunAsync_PretranslateTextIds()
public async Task RunAsync_EnableKeyTerms()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultParatextCorpus with { };
TranslationCorpus corpus1 = env.DefaultParatextCorpus with { };

await env.RunBuildJobAsync(corpus1, useKeyTerms: true);

Expand All @@ -112,7 +120,7 @@ public async Task RunAsync_EnableKeyTerms()
public async Task RunAsync_DisableKeyTerms()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultParatextCorpus with { };
TranslationCorpus corpus1 = env.DefaultParatextCorpus with { };

await env.RunBuildJobAsync(corpus1, useKeyTerms: false);

Expand All @@ -130,7 +138,7 @@ public async Task RunAsync_DisableKeyTerms()
public async Task RunAsync_PretranslateChapters()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultParatextCorpus with
TranslationCorpus corpus1 = env.DefaultParatextCorpus with
{
PretranslateChapters = new Dictionary<string, HashSet<int>>
{
Expand All @@ -150,7 +158,7 @@ public async Task RunAsync_PretranslateChapters()
public async Task RunAsync_TrainOnChapters()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultParatextCorpus with
TranslationCorpus corpus1 = env.DefaultParatextCorpus with
{
TrainOnChapters = new Dictionary<string, HashSet<int>>
{
Expand All @@ -177,7 +185,7 @@ public async Task RunAsync_TrainOnChapters()
public async Task RunAsync_MixedSource_Paratext()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultMixedSourceParatextCorpus with
TranslationCorpus corpus1 = env.DefaultMixedSourceParatextCorpus with
{
TrainOnTextIds = null,
PretranslateTextIds = null
Expand All @@ -200,7 +208,7 @@ public async Task RunAsync_MixedSource_Paratext()
public async Task RunAsync_MixedSource_Text()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultMixedSourceTextFileCorpus with
TranslationCorpus corpus1 = env.DefaultMixedSourceTextFileCorpus with
{
TrainOnTextIds = null,
PretranslateTextIds = null,
Expand All @@ -225,7 +233,7 @@ public async Task RunAsync_MixedSource_Text()
public void RunAsync_UnknownLanguageTagsNoData()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { SourceLanguage = "xxx", TargetLanguage = "zzz" };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { SourceLanguage = "xxx", TargetLanguage = "zzz" };

Assert.ThrowsAsync<InvalidOperationException>(async () =>
{
Expand All @@ -237,7 +245,7 @@ public void RunAsync_UnknownLanguageTagsNoData()
public async Task RunAsync_UnknownLanguageTagsNoDataSmtTransfer()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultTextFileCorpus with { SourceLanguage = "xxx", TargetLanguage = "zzz" };
TranslationCorpus corpus1 = env.DefaultTextFileCorpus with { SourceLanguage = "xxx", TargetLanguage = "zzz" };

await env.RunBuildJobAsync(corpus1, engineId: "engine2", engineType: TranslationEngineType.SmtTransfer);
}
Expand All @@ -246,7 +254,7 @@ public async Task RunAsync_UnknownLanguageTagsNoDataSmtTransfer()
public async Task RunAsync_RemoveFreestandingEllipses()
{
using TestEnvironment env = new();
Corpus corpus1 = env.DefaultParatextCorpus with
TranslationCorpus corpus1 = env.DefaultParatextCorpus with
{
TrainOnChapters = new Dictionary<string, HashSet<int>>
{
Expand Down Expand Up @@ -286,7 +294,7 @@ public async Task RunAsync_RemoveFreestandingEllipses()
public void RunAsync_OnlyParseSelectedBooks_NoBadBooks()
{
using TestEnvironment env = new();
Corpus corpus = env.DefaultParatextCorpus with
TranslationCorpus corpus = env.DefaultParatextCorpus with
{
TrainOnTextIds = new() { "LEV" },
PretranslateTextIds = new() { "MRK" }
Expand All @@ -310,7 +318,7 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks()
public void RunAsync_OnlyParseSelectedBooks_TrainOnBadBook()
{
using TestEnvironment env = new();
Corpus corpus = env.DefaultParatextCorpus with
TranslationCorpus corpus = env.DefaultParatextCorpus with
{
TrainOnTextIds = new() { "MAT" },
PretranslateTextIds = new() { "MRK" }
Expand All @@ -333,7 +341,7 @@ public void RunAsync_OnlyParseSelectedBooks_TrainOnBadBook()
public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook()
{
using TestEnvironment env = new();
Corpus corpus = env.DefaultParatextCorpus with
TranslationCorpus corpus = env.DefaultParatextCorpus with
{
TrainOnTextIds = new() { "LEV" },
PretranslateTextIds = new() { "MAT" }
Expand Down Expand Up @@ -375,10 +383,10 @@ private class TestEnvironment : DisposableBase
public IClearMLService ClearMLService { get; }
public IOptionsMonitor<BuildJobOptions> BuildJobOptions { get; }

public Corpus DefaultTextFileCorpus { get; }
public Corpus DefaultMixedSourceTextFileCorpus { get; }
public Corpus DefaultParatextCorpus { get; }
public Corpus DefaultMixedSourceParatextCorpus { get; }
public TranslationCorpus DefaultTextFileCorpus { get; }
public TranslationCorpus DefaultMixedSourceTextFileCorpus { get; }
public TranslationCorpus DefaultParatextCorpus { get; }
public TranslationCorpus DefaultMixedSourceParatextCorpus { get; }

public TestEnvironment()
{
Expand Down Expand Up @@ -614,7 +622,7 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType)
}

public Task RunBuildJobAsync(
Corpus corpus,
TranslationCorpus corpus,
bool useKeyTerms = true,
string engineId = "engine1",
TranslationEngineType engineType = TranslationEngineType.Nmt
Expand Down
Loading

0 comments on commit 06b1173

Please sign in to comment.