Skip to content

Commit

Permalink
Merge branch 'move_preprocess_logic_to_toolkit' of https://github.com/sillsdev/serval into move_preprocess_logic_to_toolkit
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Nov 6, 2024
2 parents 2768b4c + 891f067 commit f950d6e
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 39 deletions.
9 changes: 2 additions & 7 deletions Serval.sln
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{32B63C4B-AECD-4499-ADFB-69EF581B4F4C}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{76123A14-29A5-480D-942E-FE00D6474D50}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -227,8 +223,7 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
{76123A14-29A5-480D-942E-FE00D6474D50} = {32B63C4B-AECD-4499-ADFB-69EF581B4F4C}
{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {76123A14-29A5-480D-942E-FE00D6474D50}
{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
(sp, cancellationToken) =>
sp.GetRequiredService<IDistributedReaderWriterLockFactory>().InitAsync(cancellationToken)
);
services.AddParallelCorpusPreprocessor();

var builder = new MachineBuilder(services, configuration);
if (configuration is null)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace SIL.ServiceToolkit.Utils;
namespace SIL.ServiceToolkit.Services;

public class ParallelCorpusPreprocessingService : IParallelCorpusPreprocessingService
{
Expand Down Expand Up @@ -63,10 +63,8 @@ row.Ref is not ScriptureRef sr
return textCorpus.Where(row =>
row.Ref is not ScriptureRef sr
|| sc.Corpus.PretranslateChapters is null
|| (
IsInChapters(sr, sc.Corpus.PretranslateChapters)
|| IsInChapters(sr, sc.Corpus.PretranslateChapters)
&& !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new())
)
);
})
.ToArray();
Expand Down Expand Up @@ -111,9 +109,7 @@ row.Ref is not ScriptureRef sr
if (targetNonEmptyRows.Length > 0)
nonEmptyRows = targetNonEmptyRows;
if (nonEmptyRows.Length > 0)
{
row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
}
}
skipCount = row.RowCount - 1;
train(row);
Expand Down Expand Up @@ -300,9 +296,7 @@ private IEnumerable<Row> AlignPretranslateCorpus(ITextCorpus[] srcCorpora, IText
}
Row row = rows.First();
if (rows.Length > 1)
{
row = rows[_random.Next(rows.Length)];
}
if (rows.Select(r => r.Refs.Count).Distinct().Count() > 1)
skipCount = row.RowCount - 1;
yield return row;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<RootNamespace>SIL.ServiceToolkit</RootNamespace>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
<PackageReference Include="NSubstitute" Version="5.1.0" />
<PackageReference Include="NSubstitute.Analyzers.CSharp" Version="1.0.16">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="NUnit" Version="4.0.1" />
<PackageReference Include="NUnit3TestAdapter" Version="4.5.0" />
<PackageReference Include="NUnit.Analyzers" Version="4.0.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
<PackageReference Include="NSubstitute" Version="5.1.0" />
<PackageReference Include="NSubstitute.Analyzers.CSharp" Version="1.0.16">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="NUnit" Version="4.0.1" />
<PackageReference Include="NUnit3TestAdapter" Version="4.5.0" />
<PackageReference Include="NUnit.Analyzers" Version="4.0.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="../../src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj"/>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../../src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
namespace SIL.ServiceToolkit.Services;

/// <summary>
/// Exercises <see cref="ParallelCorpusPreprocessingService"/> against the plain-text fixture
/// files located in the <c>Services/data</c> directory.
/// </summary>
[TestFixture]
public class ParallelCorpusPreprocessorServiceTests
{
    // Fixture directory: three levels up from the application base directory, then Services/data.
    private static readonly string TestDataPath = Path.Combine(
        AppContext.BaseDirectory, "..", "..", "..", "Services", "data");

    /// <summary>Builds the full path of a fixture file inside <see cref="TestDataPath"/>.</summary>
    private static string DataFile(string fileName) => Path.Combine(TestDataPath, fileName);

    /// <summary>
    /// Preprocesses a single parallel corpus made of two source text corpora and one target
    /// text corpus, counting how often each callback fires.
    /// </summary>
    [Test]
    public void TestParallelCorpusPreprocessor()
    {
        var service = new ParallelCorpusPreprocessingService(new CorpusService());

        // One parallel corpus: source1.txt + source2.txt on the source side, target1.txt on the target side.
        ParallelCorpus parallelCorpus = new()
        {
            Id = "corpus1",
            SourceCorpora =
            [
                new()
                {
                    Id = "source-corpus1",
                    Language = "en",
                    Files =
                    [
                        new()
                        {
                            TextId = "textId1",
                            Format = FileFormat.Text,
                            Location = DataFile("source1.txt"),
                        },
                    ],
                },
                new()
                {
                    Id = "source-corpus2",
                    Language = "en",
                    Files =
                    [
                        new()
                        {
                            TextId = "textId1",
                            Format = FileFormat.Text,
                            Location = DataFile("source2.txt"),
                        },
                    ],
                },
            ],
            TargetCorpora =
            [
                new()
                {
                    Id = "target-corpus1",
                    Language = "en",
                    Files =
                    [
                        new()
                        {
                            TextId = "textId1",
                            Format = FileFormat.Text,
                            Location = DataFile("target1.txt"),
                        },
                    ],
                },
            ],
        };
        List<ParallelCorpus> parallelCorpora = [parallelCorpus];

        int trainedRows = 0;
        int pretranslatedRows = 0;

        // NOTE(review): the meaning of the trailing boolean argument is not visible from this
        // file — confirm against the Preprocess signature.
        service.Preprocess(
            parallelCorpora,
            row =>
            {
                // Only rows that have text on both sides count as training rows.
                if (row.SourceSegment.Length <= 0 || row.TargetSegment.Length <= 0)
                    return;
                trainedRows++;
            },
            (_, _) =>
            {
                // Every pretranslate callback counts, regardless of row content.
                pretranslatedRows++;
            },
            false
        );

        Assert.Multiple(() =>
        {
            Assert.That(trainedRows, Is.EqualTo(2));
            Assert.That(pretranslatedRows, Is.EqualTo(3));
        });
    }
}

0 comments on commit f950d6e

Please sign in to comment.