diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs index 9eb7f6bd..3568ed66 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs @@ -196,6 +196,7 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder) builder.Services.AddMemoryDataAccess(o => { o.AddRepository(); + o.AddRepository(); o.AddRepository(); o.AddRepository(); o.AddRepository(); @@ -232,6 +233,23 @@ await c.Indexes.CreateOrUpdateAsync( ); } ); + o.AddRepository( + "word_alignment_engines", + mapSetup: m => m.SetIgnoreExtraElements(true), + init: async c => + { + await c.Indexes.CreateOrUpdateAsync( + new CreateIndexModel( + Builders.IndexKeys.Ascending(e => e.EngineId) + ) + ); + await c.Indexes.CreateOrUpdateAsync( + new CreateIndexModel( + Builders.IndexKeys.Ascending(e => e.CurrentBuild!.BuildJobRunner) + ) + ); + } + ); o.AddRepository("locks"); o.AddRepository( "train_segment_pairs", diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLMonitorService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLMonitorService.cs index 644320aa..e3758cd7 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLMonitorService.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLMonitorService.cs @@ -48,7 +48,7 @@ private async Task MonitorClearMLTasksPerDomain(IServiceScope scope, Cancellatio try { var translationBuildJobService = scope.ServiceProvider.GetRequiredService< - IBuildJobService + IBuildJobService >(); var wordAlignmentBuildJobService = scope.ServiceProvider.GetRequiredService< IBuildJobService @@ -56,7 +56,7 @@ private async Task MonitorClearMLTasksPerDomain(IServiceScope scope, Cancellatio Dictionary> engineToBuildServiceDict = ( await translationBuildJobService.GetBuildingEnginesAsync(BuildJobRunnerType.ClearML, cancellationToken) - ).ToDictionary(e => e, e => translationBuildJobService); + ).ToDictionary(e => (ITrainingEngine)e, e => (IBuildJobService)translationBuildJobService); foreach ( var engine in await wordAlignmentBuildJobService.GetBuildingEnginesAsync( diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index 35105c69..b4ba2685 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -438,7 +438,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -450,7 +450,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -2410,7 +2410,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -2422,7 +2422,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -3134,7 +3134,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -3146,7 +3146,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -3192,7 +3192,7 @@ public partial interface IDataFilesClient /// /// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -3407,7 +3407,7 @@ public string BaseUrl /// /// /// Sample request: - ///
+ ///
///
POST /files ///
{ ///
"format": "text", @@ -4060,7 +4060,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -4072,7 +4072,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -4131,14 +4131,14 @@ public partial interface ITranslationEnginesClient ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -4368,7 +4368,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -4390,7 +4390,7 @@ public partial interface ITranslationEnginesClient ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id @@ -4410,12 +4410,12 @@ public partial interface ITranslationEnginesClient ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
Both scripture and non-scripture text in the USFM is parsed and grouped according to [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -4445,30 +4445,30 @@ public partial interface ITranslationEnginesClient ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -4529,10 +4529,10 @@ public partial interface ITranslationEnginesClient /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -4703,14 +4703,14 @@ public string BaseUrl ///
### nmt ///
The Neural Machine Translation engine is primarily used for pretranslations. It is fine-tuned from Meta's NLLB-200. Valid IETF language tags provided to Serval will be converted to [NLLB-200 codes](https://github.com/facebookresearch/flores/tree/main/flores200#languages-in-flores-200). See more about language tag resolution [here](https://github.com/sillsdev/serval/wiki/FLORES%E2%80%90200-Language-Code-Resolution-for-NMT-Engine). ///
* **IsModelPersisted**: (default to false) Whether the model can be downloaded by the client after it has been successfully built. - ///
+ ///
///
If you use a language among NLLB's supported languages, Serval will utilize everything the NLLB-200 model already knows about that language when translating. If the language you are working with is not among NLLB's supported languages, the language code will have no effect. - ///
+ ///
///
Typical endpoints: pretranslate ///
### echo ///
The echo engine has full coverage of all nmt and smt-transfer endpoints. Endpoints like create and build return empty responses. Endpoints like translate and get-word-graph echo the sent content back to the user in a format that mocks nmt or Smt. For example, translating a segment "test" with the echo engine would yield a translation response with translation "test". This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -6717,7 +6717,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Pretranslations can be filtered by text id if provided. ///
Only pretranslations for the most recent successful build of the engine are returned. ///
@@ -6851,7 +6851,7 @@ public string BaseUrl ///
* The references defined in the SourceFile per line, if any. ///
* An auto-generated reference of `[TextId]:[lineNumber]`, 1 indexed. ///
* **Translation**: the text of the pretranslation - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. /// /// The translation engine id @@ -6981,12 +6981,12 @@ public string BaseUrl ///
* `PreferPretranslated`: The existing and pretranslated texts are merged into the USFM, preferring pretranslated text. ///
* `OnlyExisting`: Return the existing target USFM file with no modifications (except updating the USFM id if needed). ///
* `OnlyPretranslated`: Only the pretranslated text is returned; all existing text in the target USFM is removed. - ///
+ ///
///
The source or target book can be used as the USFM template for the pretranslated text. The template can be controlled by the `template` parameter: ///
* `Auto`: The target book is used as the template if it exists; otherwise, the source book is used. **This is the default**. ///
* `Source`: The source book is used as the template. ///
* `Target`: The target book is used as the template. - ///
+ ///
///
Only pretranslations for the most recent successful build of the engine are returned. ///
Both scripture and non-scripture text in the USFM is parsed and grouped according to [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// @@ -7241,30 +7241,30 @@ public string BaseUrl ///
Specifying a corpus: ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. - ///
+ ///
///
Filtering by textID or chapter: ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Filter - train on all or none ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. - ///
+ ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. ///
If the engine does not support pretranslation, these fields have no effect. ///
Pretranslating has the same filtering as training. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [smt-transfer job settings documentation](https://github.com/sillsdev/serval/wiki/SMT-Transfer-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. - ///
+ ///
///
When using a parallel corpus: ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. @@ -7770,10 +7770,10 @@ public string BaseUrl /// /// If a Nmt build was successful and IsModelPersisted is `true` for the engine, ///
then the model from the most recent successful build can be downloaded. - ///
+ ///
///
The endpoint will return a URL that can be used to download the model for up to 1 hour ///
after the request is made. If the URL is not used within that time, a new request will need to be made. - ///
+ ///
///
The download itself is created by g-zipping together the folder containing the fine tuned model ///
with all necessary supporting files. This zipped folder is then named by the pattern: ///
* <engine_id>_<model_revision>.tar.gz @@ -7956,7 +7956,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -7968,7 +7968,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -8368,7 +8368,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -8380,7 +8380,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -8925,7 +8925,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -8937,7 +8937,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -8995,7 +8995,7 @@ public partial interface IWordAlignmentEnginesClient ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like get-word-alignment echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -9111,7 +9111,7 @@ public partial interface IWordAlignmentEnginesClient ///
* **TargetTokens**: the tokenized target segment ///
* **Confidences**: the confidence of the alignment ona scale from 0 to 1 ///
* **Alignment**: the word alignment, 0 indexed for source and target positions - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. ///
@@ -9141,10 +9141,10 @@ public partial interface IWordAlignmentEnginesClient ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word align on, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -9358,7 +9358,7 @@ public string BaseUrl ///
The echo-word-alignment engine has full coverage of all endpoints. Endpoints like create and build return empty responses. ///
Endpoints like get-word-alignment echo the sent content back to the user in the proper format. This engine is useful for debugging and testing purposes. ///
## Sample request: - ///
+ ///
///
{ ///
"name": "myTeam:myProject:myEngine", ///
"sourceLanguage": "el", @@ -10380,7 +10380,7 @@ public string BaseUrl ///
* **TargetTokens**: the tokenized target segment ///
* **Confidences**: the confidence of the alignment ona scale from 0 to 1 ///
* **Alignment**: the word alignment, 0 indexed for source and target positions - ///
+ ///
///
Word alignments can be filtered by text id if provided. ///
Only word alignments for the most recent successful build of the engine are returned. /// @@ -10617,10 +10617,10 @@ public string BaseUrl ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. - ///
+ ///
///
Specify the corpora or textIds to word align on. ///
When a corpus or textId is selected for word align on, only text segments that are in both the source and the target will be aligned. - ///
+ ///
///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [statistical alignment job settings documentation](https://github.com/sillsdev/serval/wiki/Statistical-Alignment-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. @@ -11191,7 +11191,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -11203,7 +11203,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } @@ -11464,7 +11464,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c var field = System.Reflection.IntrospectionExtensions.GetTypeInfo(value.GetType()).GetDeclaredField(name); if (field != null) { - var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) + var attribute = System.Reflection.CustomAttributeExtensions.GetCustomAttribute(field, typeof(System.Runtime.Serialization.EnumMemberAttribute)) as System.Runtime.Serialization.EnumMemberAttribute; if (attribute != null) { @@ -11476,7 +11476,7 @@ private string ConvertToString(object? value, System.Globalization.CultureInfo c return converted == null ? string.Empty : converted; } } - else if (value is bool) + else if (value is bool) { return System.Convert.ToString((bool)value, cultureInfo).ToLowerInvariant(); } diff --git a/src/Serval/src/Serval.WordAlignment/Configuration/IMongoDataAccessConfiguratorExtensions.cs b/src/Serval/src/Serval.WordAlignment/Configuration/IMongoDataAccessConfiguratorExtensions.cs index 76e6a10f..0b680425 100644 --- a/src/Serval/src/Serval.WordAlignment/Configuration/IMongoDataAccessConfiguratorExtensions.cs +++ b/src/Serval/src/Serval.WordAlignment/Configuration/IMongoDataAccessConfiguratorExtensions.cs @@ -25,7 +25,7 @@ await c.Indexes.CreateOrUpdateAsync( ) ); configurator.AddRepository( - "word_alignment.pretranslations", + "word_alignment.word_alignments", init: async c => { await c.Indexes.CreateOrUpdateAsync( diff --git a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs index 2fb9f86a..e7a89280 100644 --- a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs +++ b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs @@ -146,13 +146,8 @@ public async Task NmtQueueMultiple() const int NUM_WORKERS = 8; string[] engineIds = new string[NUM_ENGINES]; string[] books = ["MAT.txt", "1JN.txt", "2JN.txt"]; - TranslationParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus( - books, - "es", - "en", - false - ); - TranslationParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( + IParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus(books, "es", "en", false); + IParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( ["3JN.txt"], "es", "en", @@ -165,7 +160,7 @@ public async Task NmtQueueMultiple() string engineId = engineIds[i]; await _helperClient.AddParallelTextCorpusToEngineAsync(engineId, train_corpus, false); await _helperClient.AddParallelTextCorpusToEngineAsync(engineId, pretranslate_corpus, true); - await _helperClient.StartBuildAsync(engineId); + await _helperClient.StartTranslationBuildAsync(engineId); //Ensure that tasks are enqueued roughly in order await Task.Delay(1_000); } @@ -227,13 +222,8 @@ public async Task NmtLargeBatchAndDownload() TranslationEngine engine = await _helperClient.TranslationEnginesClient.GetAsync(engineId); Assert.That(engine.IsModelPersisted, Is.True); string[] books = ["bible_LARGEFILE.txt"]; - TranslationParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus( - books, - "es", - "en", - false - ); - TranslationParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( + IParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus(books, "es", "en", false); + IParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( ["3JN.txt"], "es", "en", @@ -370,7 +360,7 @@ public async Task GetSmtCancelAndRestartBuild() async Task StartAndCancelTwice(string engineId) { // start and first job - TranslationBuild build = await _helperClient.StartBuildAsync(engineId); + TranslationBuild build = await _helperClient.StartTranslationBuildAsync(engineId); await Task.Delay(1000); build = await _helperClient.TranslationEnginesClient.GetBuildAsync(engineId, build.Id); Assert.That(build.State == JobState.Active || build.State == JobState.Pending); @@ -381,7 +371,7 @@ async Task StartAndCancelTwice(string engineId) Assert.That(build.State == JobState.Canceled); // do a second job normally and make sure it works. - build = await _helperClient.StartBuildAsync(engineId); + build = await _helperClient.StartTranslationBuildAsync(engineId); await Task.Delay(1000); build = await _helperClient.TranslationEnginesClient.GetBuildAsync(engineId, build.Id); Assert.That(build.State == JobState.Active || build.State == JobState.Pending); @@ -460,6 +450,29 @@ public async Task ParatextProjectNmtJobAsync() Assert.That(usfm, Does.Contain("\\v 1")); } + [Test] + public async Task GetWordAlignment() + { + string engineId = await _helperClient.CreateNewEngineAsync("statistical", "es", "en", "STAT1"); + string[] books = ["1JN.txt", "2JN.txt", "3JN.txt"]; + await _helperClient.AddTextCorpusToEngineAsync(engineId, books, "es", "en", false); + await _helperClient.BuildEngineAsync(engineId); + WordAlignmentResult tResult = await _helperClient.WordAlignmentEnginesClient.GetWordAlignmentAsync( + engineId, + new WordAlignmentRequest() { SourceSegment = "espĂ­ritu verdad", TargetSegment = "spirit truth" } + ); + Assert.That( + tResult.Alignment, + Is.EqualTo( + new List + { + new() { SourceIndex = 0, TargetIndex = 0 }, + new() { SourceIndex = 1, TargetIndex = 1 } + } + ) + ); + } + [TearDown] public async Task TearDown() { diff --git a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs index d489cf9a..a3e7945d 100644 --- a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs +++ b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs @@ -2,15 +2,47 @@ namespace Serval.E2ETests; #pragma warning disable CS0612 // Type or member is obsolete +public enum EngineGroup +{ + Translation, + WordAlignment +} + +public interface IBuild +{ + string Id { get; set; } + int Revision { get; set; } + JobState State { get; set; } +} + +public interface IParallelCorpus +{ + string Id { get; set; } + string Url { get; set; } + ResourceLink Engine { get; set; } + IList SourceCorpora { get; set; } + IList TargetCorpora { get; set; } +} + +public interface IParallelCorpusConfig +{ + public string Name { get; set; } + public IList SourceCorpusIds { get; set; } + public IList TargetCorpusIds { get; set; } +} + public class ServalClientHelper : IAsyncDisposable { public DataFilesClient DataFilesClient { get; } public CorporaClient CorporaClient { get; } public TranslationEnginesClient TranslationEnginesClient { get; } + public WordAlignmentEnginesClient WordAlignmentEnginesClient { get; } public TranslationEngineTypesClient TranslationEngineTypesClient { get; } public TranslationBuildConfig TranslationBuildConfig { get; set; } + public WordAlignmentBuildConfig WordAlignmentBuildConfig { get; set; } + private IDictionary EngineIdToEngineGroup { get; } = new Dictionary(); private string _authToken = ""; private readonly HttpClient _httpClient; private readonly string _prefix; @@ -40,6 +72,8 @@ public ServalClientHelper(string audience, string prefix = "SCE_", bool ignoreSS TranslationEngineTypesClient = new TranslationEngineTypesClient(_httpClient); _prefix = prefix; TranslationBuildConfig = InitTranslationBuildConfig(); + WordAlignmentEnginesClient = new WordAlignmentEnginesClient(_httpClient); + WordAlignmentBuildConfig = InitWordAlignmentBuildConfig(); } public async Task InitAsync() @@ -65,6 +99,7 @@ public async Task InitAsync() public void Setup() { InitTranslationBuildConfig(); + InitWordAlignmentBuildConfig(); } public TranslationBuildConfig InitTranslationBuildConfig() @@ -87,6 +122,17 @@ public TranslationBuildConfig InitTranslationBuildConfig() return TranslationBuildConfig; } + public WordAlignmentBuildConfig InitWordAlignmentBuildConfig() + { + WordAlignmentBuildConfig = new WordAlignmentBuildConfig + { + WordAlignOn = [], + TrainOn = null, + Options = "" + }; + return WordAlignmentBuildConfig; + } + public async Task ClearEnginesAsync() { IList existingTranslationEngines = await TranslationEnginesClient.GetAllAsync(); @@ -95,48 +141,86 @@ public async Task ClearEnginesAsync() if (translationEngine.Name?.Contains(_prefix) ?? false) await TranslationEnginesClient.DeleteAsync(translationEngine.Id); } + IList existingWordAlignmentEngines = await WordAlignmentEnginesClient.GetAllAsync(); + foreach (WordAlignmentEngine wordAlignmentEngine in existingWordAlignmentEngines) + { + if (wordAlignmentEngine.Name?.Contains(_prefix) ?? false) + await WordAlignmentEnginesClient.DeleteAsync(wordAlignmentEngine.Id); + } } public async Task CreateNewEngineAsync( - string engineTypeString, - string source_language, - string target_language, + string engineType, + string sourceLanguage, + string targetLanguage, string name = "", bool? isModelPersisted = null ) { - TranslationEngine engine = await TranslationEnginesClient.CreateAsync( - new TranslationEngineConfig - { - Name = _prefix + name, - SourceLanguage = source_language, - TargetLanguage = target_language, - Type = engineTypeString, - IsModelPersisted = isModelPersisted - } - ); - return engine.Id; + EngineGroup engineGroup = GetEngineGroup(engineType); + if (engineGroup == EngineGroup.Translation) + { + TranslationEngine engine = await TranslationEnginesClient.CreateAsync( + new TranslationEngineConfig + { + Name = name, + SourceLanguage = sourceLanguage, + TargetLanguage = targetLanguage, + Type = engineGroup.ToString(), + IsModelPersisted = isModelPersisted + } + ); + EngineIdToEngineGroup[engine.Id] = engineGroup; + return engine.Id; + } + else + { + WordAlignmentEngine engine = await WordAlignmentEnginesClient.CreateAsync( + new WordAlignmentEngineConfig + { + Name = name, + SourceLanguage = sourceLanguage, + TargetLanguage = targetLanguage, + Type = engineGroup.ToString(), + } + ); + EngineIdToEngineGroup[engine.Id] = engineGroup; + return engine.Id; + } } - public async Task StartBuildAsync(string engineId) + public async Task StartTranslationBuildAsync(string engineId) { return await TranslationEnginesClient.StartBuildAsync(engineId, TranslationBuildConfig); } public async Task BuildEngineAsync(string engineId) { - TranslationBuild newJob = await StartBuildAsync(engineId); + EngineGroup engineGroup = EngineIdToEngineGroup[engineId]; + IBuild newJob; + if (engineGroup == EngineGroup.Translation) + { + newJob = (IBuild)await StartTranslationBuildAsync(engineId); + } + else + { + newJob = (IBuild)await WordAlignmentEnginesClient.StartBuildAsync(engineId, WordAlignmentBuildConfig); + } int revision = newJob.Revision; await TranslationEnginesClient.GetBuildAsync(engineId, newJob.Id, newJob.Revision); while (true) { try { - TranslationBuild result = await TranslationEnginesClient.GetBuildAsync( - engineId, - newJob.Id, - revision + 1 - ); + IBuild result; + if (engineGroup == EngineGroup.Translation) + { + result = (IBuild)await TranslationEnginesClient.GetBuildAsync(engineId, newJob.Id, revision + 1); + } + else + { + result = (IBuild)await WordAlignmentEnginesClient.GetBuildAsync(engineId, newJob.Id, revision + 1); + } if (!(result.State == JobState.Active || result.State == JobState.Pending)) // build completed break; @@ -155,12 +239,28 @@ public async Task BuildEngineAsync(string engineId) public async Task CancelBuildAsync(string engineId, string buildId, int timeoutSeconds = 20) { - await TranslationEnginesClient.CancelBuildAsync(engineId); + EngineGroup engineGroup = EngineIdToEngineGroup[engineId]; + if (engineGroup == EngineGroup.Translation) + { + await TranslationEnginesClient.CancelBuildAsync(engineId); + } + else + { + await WordAlignmentEnginesClient.CancelBuildAsync(engineId); + } int pollIntervalMs = 1000; int tries = 1; while (true) { - TranslationBuild build = await TranslationEnginesClient.GetBuildAsync(engineId, buildId); + IBuild build; + if (engineGroup == EngineGroup.Translation) + { + build = (IBuild)await TranslationEnginesClient.GetBuildAsync(engineId, buildId); + } + else + { + build = (IBuild)await WordAlignmentEnginesClient.GetBuildAsync(engineId, buildId); + } if (build.State != JobState.Pending && build.State != JobState.Active) break; if (tries++ > timeoutSeconds) @@ -176,13 +276,17 @@ public async Task AddTextCorpusToEngineAsync( string[] filesToAdd, string sourceLanguage, string targetLanguage, - bool pretranslate + bool inference ) { + EngineGroup engineGroup = EngineIdToEngineGroup[engineId]; + if (engineGroup == EngineGroup.WordAlignment) + throw new ArgumentException("Word alignment engines do not support non-parallel corpora."); + List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); var targetFileConfig = new List(); - if (!pretranslate) + if (!inference) { List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); foreach (var item in targetFiles.Select((file, i) => new { i, file })) @@ -195,7 +299,7 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) + if (sourceLanguage == targetLanguage && !inference) { // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) // if pretranslating, we need to upload the source separately @@ -223,7 +327,7 @@ bool pretranslate } ); - if (pretranslate) + if (inference) { TranslationBuildConfig.Pretranslate!.Add( new PretranslateCorpusConfig { CorpusId = response.Id, TextIds = filesToAdd.ToList() } @@ -233,17 +337,17 @@ bool pretranslate return response.Id; } - public async Task MakeParallelTextCorpus( + public async Task MakeParallelTextCorpus( string[] filesToAdd, string sourceLanguage, string targetLanguage, - bool pretranslate + bool inference ) { List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); var targetFileConfig = new List(); - if (!pretranslate) + if (!inference) { List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); foreach (var item in targetFiles.Select((file, i) => new { i, file })) @@ -264,7 +368,7 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) + if (sourceLanguage == targetLanguage && !inference) { // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) // if pretranslating, we need to upload the source separately @@ -291,22 +395,44 @@ bool pretranslate TranslationParallelCorpusConfig parallelCorpusConfig = new() { SourceCorpusIds = { sourceCorpus.Id }, TargetCorpusIds = { targetCorpus.Id } }; - return parallelCorpusConfig; + return (IParallelCorpusConfig)parallelCorpusConfig; } public async Task AddParallelTextCorpusToEngineAsync( string engineId, - TranslationParallelCorpusConfig parallelCorpusConfig, - bool pretranslate + IParallelCorpusConfig parallelCorpusConfig, + bool inference ) { - var parallelCorpus = await TranslationEnginesClient.AddParallelCorpusAsync(engineId, parallelCorpusConfig); - - if (pretranslate) + EngineGroup engineGroup = EngineIdToEngineGroup[engineId]; + IParallelCorpus parallelCorpus; + if (engineGroup == EngineGroup.Translation) { - TranslationBuildConfig.Pretranslate!.Add( - new PretranslateCorpusConfig { ParallelCorpusId = parallelCorpus.Id } - ); + parallelCorpus = (IParallelCorpus) + await TranslationEnginesClient.AddParallelCorpusAsync( + engineId, + (TranslationParallelCorpusConfig)parallelCorpusConfig + ); + if (inference) + { + TranslationBuildConfig.Pretranslate!.Add( + new PretranslateCorpusConfig { ParallelCorpusId = parallelCorpus.Id } + ); + } + } + else + { + parallelCorpus = (IParallelCorpus) + await WordAlignmentEnginesClient.AddParallelCorpusAsync( + engineId, + (WordAlignmentParallelCorpusConfig)parallelCorpusConfig + ); + if (inference) + { + WordAlignmentBuildConfig.WordAlignOn!.Add( + new TrainingCorpusConfig2 { ParallelCorpusId = parallelCorpus.Id } + ); + } } return parallelCorpus.Id; @@ -405,6 +531,18 @@ private static HttpClientHandler GetHttHandlerToIgnoreSslErrors() return handler; } + public static EngineGroup GetEngineGroup(string engineType) + { + return engineType switch + { + "SmtTransfer" => EngineGroup.Translation, + "Nmt" => EngineGroup.Translation, + "Echo" => EngineGroup.Translation, + "Statistical" => EngineGroup.WordAlignment, + _ => throw new ArgumentOutOfRangeException(engineType, "Unknown engine type") + }; + } + public async ValueTask TearDown() { if (Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") != "Development")