From 7b79fbc3a822f5136e45a53159b6bbe3afa3e515 Mon Sep 17 00:00:00 2001 From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:33:23 -0400 Subject: [PATCH] Add missing documentation/fix existing documentation regarding parallel corpora (#507) --- src/Serval/src/Serval.Client/Client.g.cs | 68 +++++++------------ .../TranslationEnginesController.cs | 34 ++++------ 2 files changed, 36 insertions(+), 66 deletions(-) diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index e55775e7..c2d3dd6e 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -4294,25 +4294,12 @@ public partial interface ITranslationEnginesClient /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Add a corpus to a translation engine + /// Add a parallel corpus to a translation engine /// /// /// ## Parameters - ///
* **name**: A name to help identify and distinguish the corpus from other corpora - ///
* The name does not have to be unique since the corpus is uniquely identified by an auto-generated id - ///
* **sourceLanguage**: The source language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - ///
* Normally, this is the same as the engine sourceLanguage. This may change for future engines as a means of transfer learning. - ///
* **targetLanguage**: The target language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - ///
* **SourceFiles**: The source files associated with the corpus - ///
* **FileId**: The unique id referencing the uploaded file - ///
* **TextId**: The client-defined name to associate source and target files. - ///
* If the TextIds in the SourceFiles and TargetFiles match, they will be used to train the engine. - ///
* If selected for pretranslation when building, all SourceFiles that have no TargetFile, or lines of text in a SourceFile that have missing or blank lines in the TargetFile will be pretranslated. - ///
* If a TextId is used more than once in SourceFiles, the sources will be randomly and evenly mixed for training. - ///
* For pretranslating, multiple sources with the same TextId will be combined, but the first source will always take precedence (no random mixing). - ///
* For Paratext projects, TextId will be ignored - multiple Paratext source projects will always be mixed (as if they have the same TextId). - ///
* **TargetFiles**: The target files associated with the corpus - ///
* Same as SourceFiles, except only a single instance of a TextID or a single paratext project is supported. There is no mixing or combining of multiple targets. + ///
* **SourceCorpusIds**: The source corpora associated with the parallel corpus + ///
* **TargetCorpusIds**: The target corpora associated with the parallel corpus ///
/// The translation engine id /// The corpus configuration (see remarks) @@ -4331,13 +4318,13 @@ public partial interface ITranslationEnginesClient /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Update a corpus with a new set of files + /// Update a parallel corpus with a new set of corpora /// /// - /// Will completely replace corpus' file associations. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. + /// Will completely replace the source and target corpora associated with the parallel corpus. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. /// /// The translation engine id - /// The corpus id + /// The parallel corpus id /// The corpus configuration /// The corpus was updated successfully /// A server side error occurred. @@ -4450,13 +4437,12 @@ public partial interface ITranslationEnginesClient /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds to train on. If no "trainOn" field is provided, all corpora will be used. - ///
Paratext Projects, you may flag a subset of books for training by including their [abbreviations] + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. ///
- ///
Specify the corpora or textIds to pretranslate. When a corpus or textId is selected for pretranslation, + ///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. @@ -4466,6 +4452,10 @@ public partial interface ITranslationEnginesClient ///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. + ///
+ ///
When using a parallel corpus: + ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. + ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. ///
/// The translation engine id /// The build config (see remarks) @@ -6130,25 +6120,12 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Add a corpus to a translation engine + /// Add a parallel corpus to a translation engine /// /// /// ## Parameters - ///
* **name**: A name to help identify and distinguish the corpus from other corpora - ///
* The name does not have to be unique since the corpus is uniquely identified by an auto-generated id - ///
* **sourceLanguage**: The source language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - ///
* Normally, this is the same as the engine sourceLanguage. This may change for future engines as a means of transfer learning. - ///
* **targetLanguage**: The target language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - ///
* **SourceFiles**: The source files associated with the corpus - ///
* **FileId**: The unique id referencing the uploaded file - ///
* **TextId**: The client-defined name to associate source and target files. - ///
* If the TextIds in the SourceFiles and TargetFiles match, they will be used to train the engine. - ///
* If selected for pretranslation when building, all SourceFiles that have no TargetFile, or lines of text in a SourceFile that have missing or blank lines in the TargetFile will be pretranslated. - ///
* If a TextId is used more than once in SourceFiles, the sources will be randomly and evenly mixed for training. - ///
* For pretranslating, multiple sources with the same TextId will be combined, but the first source will always take precedence (no random mixing). - ///
* For Paratext projects, TextId will be ignored - multiple Paratext source projects will always be mixed (as if they have the same TextId). - ///
* **TargetFiles**: The target files associated with the corpus - ///
* Same as SourceFiles, except only a single instance of a TextID or a single paratext project is supported. There is no mixing or combining of multiple targets. + ///
* **SourceCorpusIds**: The source corpora associated with the parallel corpus + ///
* **TargetCorpusIds**: The target corpora associated with the parallel corpus ///
/// The translation engine id /// The corpus configuration (see remarks) @@ -6370,13 +6347,13 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Update a corpus with a new set of files + /// Update a parallel corpus with a new set of corpora /// /// - /// Will completely replace corpus' file associations. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. + /// Will completely replace the source and target corpora associated with the parallel corpus. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. /// /// The translation engine id - /// The corpus id + /// The parallel corpus id /// The corpus configuration /// The corpus was updated successfully /// A server side error occurred. @@ -7240,13 +7217,12 @@ public string BaseUrl /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds to train on. If no "trainOn" field is provided, all corpora will be used. - ///
Paratext Projects, you may flag a subset of books for training by including their [abbreviations] + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. ///
- ///
Specify the corpora or textIds to pretranslate. When a corpus or textId is selected for pretranslation, + ///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: ///
* Text segments that are in the source and not the target (untranslated) ///
* Text segments that are in the source and the target, but where that target segment is not trained on. @@ -7256,6 +7232,10 @@ public string BaseUrl ///
The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. ///
See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. ///
See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. + ///
+ ///
When using a parallel corpus: + ///
* If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. + ///
* If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. ///
/// The translation engine id /// The build config (see remarks) diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index cd18dc82..679ecbc2 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -511,25 +511,12 @@ CancellationToken cancellationToken } /// - /// Add a corpus to a translation engine + /// Add a parallel corpus to a translation engine /// /// /// ## Parameters - /// * **name**: A name to help identify and distinguish the corpus from other corpora - /// * The name does not have to be unique since the corpus is uniquely identified by an auto-generated id - /// * **sourceLanguage**: The source language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - /// * Normally, this is the same as the engine sourceLanguage. This may change for future engines as a means of transfer learning. - /// * **targetLanguage**: The target language code (See documentation on endpoint /translation/engines/ - "Create a new translation engine" for details on language codes). - /// * **SourceFiles**: The source files associated with the corpus - /// * **FileId**: The unique id referencing the uploaded file - /// * **TextId**: The client-defined name to associate source and target files. - /// * If the TextIds in the SourceFiles and TargetFiles match, they will be used to train the engine. - /// * If selected for pretranslation when building, all SourceFiles that have no TargetFile, or lines of text in a SourceFile that have missing or blank lines in the TargetFile will be pretranslated. - /// * If a TextId is used more than once in SourceFiles, the sources will be randomly and evenly mixed for training. 
- /// * For pretranslating, multiple sources with the same TextId will be combined, but the first source will always take precedence (no random mixing). - /// * For Paratext projects, TextId will be ignored - multiple Paratext source projects will always be mixed (as if they have the same TextId). - /// * **TargetFiles**: The target files associated with the corpus - /// * Same as SourceFiles, except only a single instance of a TextID or a single paratext project is supported. There is no mixing or combining of multiple targets. + /// * **SourceCorpusIds**: The source corpora associated with the parallel corpus + /// * **TargetCorpusIds**: The target corpora associated with the parallel corpus /// /// The translation engine id /// The corpus configuration (see remarks) @@ -572,13 +559,13 @@ CancellationToken cancellationToken } /// - /// Update a corpus with a new set of files + /// Update a parallel corpus with a new set of corpora /// /// - /// Will completely replace corpus' file associations. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. + /// Will completely replace the source and target corpora associated with the parallel corpus. Will not affect jobs already queued or running. Will not affect existing pretranslations until new build is complete. /// /// The translation engine id - /// The corpus id + /// The parallel corpus id /// The corpus configuration /// The data file client /// @@ -1003,13 +990,12 @@ CancellationToken cancellationToken /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds to train on. If no "trainOn" field is provided, all corpora will be used. - /// Paratext Projects, you may flag a subset of books for training by including their [abbreviations] + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. 
Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. /// Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. /// Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) /// All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. /// - /// Specify the corpora or textIds to pretranslate. When a corpus or textId is selected for pretranslation, + /// Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, /// the following text will be pretranslated: /// * Text segments that are in the source and not the target (untranslated) /// * Text segments that are in the source and the target, but where that target segment is not trained on. @@ -1019,6 +1005,10 @@ CancellationToken cancellationToken /// The `"options"` parameter of the build config provides the ability to pass build configuration parameters as a JSON object. /// See [nmt job settings documentation](https://github.com/sillsdev/serval/wiki/NMT-Build-Options) about configuring job parameters. /// See [keyterms parsing documentation](https://github.com/sillsdev/serval/wiki/Paratext-Key-Terms-Parsing) on how to use keyterms for training. 
+ /// + /// When using a parallel corpus: + /// * If, within a single parallel corpus, multiple source corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), those sources will be mixed where they overlap by randomly choosing from each source per line/verse. + /// * If, within a single parallel corpus, multiple target corpora have data for the same textIds (for text files or Paratext Projects) or books (for Paratext Projects only, when using the scriptureRange), only the first of the targets that includes that textId/book will be used for that textId/book. /// /// The translation engine id /// The build config (see remarks)