Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NParallelTextCorpus #270

Merged
merged 28 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e0fba03
broken
johnml1135 Oct 23, 2024
eb08370
more broken
johnml1135 Oct 23, 2024
fd76354
More broken.
johnml1135 Oct 23, 2024
266aa90
Compiling but not working
Enkidu93 Oct 25, 2024
e76177c
Progress
Enkidu93 Oct 30, 2024
d980661
More progress
Enkidu93 Oct 31, 2024
95393ec
Almost all tests passing
Enkidu93 Nov 1, 2024
6d2719f
All PTCorpus tests passing!
Enkidu93 Nov 4, 2024
c3ef946
Passing tests; added alignment corpus
Enkidu93 Nov 5, 2024
282c473
Fix test; add corpora extensions test
Enkidu93 Nov 5, 2024
bd0ec45
More fixes
Enkidu93 Nov 8, 2024
57b759d
Change naming to avoid confusion with 'Select'
Enkidu93 Nov 8, 2024
7858e20
Working NParallelTextCorpus
johnml1135 Oct 23, 2024
5a484d0
Change naming to avoid confusion with 'Select'
Enkidu93 Nov 8, 2024
7d47b9e
Merge branch 'nparallel_corpus' of https://github.com/sillsdev/machin…
Enkidu93 Nov 8, 2024
e07cf64
Reviewer-requested changes
Enkidu93 Nov 13, 2024
54ae315
Reviewer changes
Enkidu93 Nov 13, 2024
d97ea1c
Remove alignment corpus from NPTC; move logic to PTC
Enkidu93 Nov 14, 2024
37dde83
Remove redundant check
Enkidu93 Nov 14, 2024
b3bd7b9
Property to field
Enkidu93 Nov 14, 2024
0d351f2
Comments, small refactoring
Enkidu93 Nov 14, 2024
14697e4
More fixes
Enkidu93 Nov 14, 2024
2058d4d
Change mergedtextcorpus parameter
Enkidu93 Nov 15, 2024
6ab2faa
More reviewer-requested changes
Enkidu93 Nov 15, 2024
6ca6027
Move same ref rows out of range info
Enkidu93 Nov 15, 2024
5376918
Make seed optional; remove unneeded code
Enkidu93 Nov 18, 2024
4633385
Fix typo
Enkidu93 Nov 18, 2024
54419cf
Merge branch 'master' into nparallel_corpus
Enkidu93 Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,33 @@ public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)

#endregion

#region INParallelTextCorpus operations

/// <summary>
/// Wraps the given text corpora in a single <see cref="NParallelTextCorpus"/>.
/// </summary>
/// <param name="corpora">The text corpora passed to the <see cref="NParallelTextCorpus"/> constructor.</param>
/// <param name="allRowsPerCorpus">
/// Optional flags that are copied into an array and assigned to
/// <see cref="NParallelTextCorpus.AllRows"/>. When <c>null</c>, <c>AllRows</c> is left as the
/// constructor initialized it.
/// </param>
/// <returns>The newly created n-way parallel text corpus.</returns>
public static INParallelTextCorpus AlignMany(
    this IEnumerable<ITextCorpus> corpora,
    IEnumerable<bool> allRowsPerCorpus = null
)
{
    var aligned = new NParallelTextCorpus(corpora);
    if (allRowsPerCorpus == null)
        return aligned;

    aligned.AllRows = allRowsPerCorpus.ToArray();
    return aligned;
}

/// <summary>
/// Merges the corpora into a single <see cref="MergedTextCorpus"/> that uses
/// <see cref="MergeRule.Random"/>.
/// </summary>
/// <param name="corpora">The corpora to merge.</param>
/// <param name="seed">
/// Optional seed forwarded unchanged to the <see cref="MergedTextCorpus"/> constructor;
/// defaults to <c>null</c>.
/// </param>
/// <returns>The merged text corpus.</returns>
public static ITextCorpus ChooseRandom(this IEnumerable<ITextCorpus> corpora, int? seed = null) =>
    new MergedTextCorpus(corpora, MergeRule.Random, seed);

/// <summary>
/// Merges the corpora into a single <see cref="MergedTextCorpus"/> that uses
/// <see cref="MergeRule.First"/>.
/// </summary>
/// <param name="corpora">The corpora to merge.</param>
/// <returns>The merged text corpus.</returns>
public static ITextCorpus ChooseFirst(this IEnumerable<ITextCorpus> corpora) =>
    new MergedTextCorpus(corpora, MergeRule.First);

#endregion

#region IAlignmentCorpus operations

public static IAlignmentCorpus Transform(
Expand Down
5 changes: 5 additions & 0 deletions src/SIL.Machine/Corpora/CorpusAlignmentException.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef)
: base(
$"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs."
) { }

/// <summary>
/// Creates an exception reporting mismatched key formats across any number of refs.
/// </summary>
/// <param name="refs">The refs to list, comma-separated, in the exception message.</param>
public CorpusAlignmentException(string[] refs)
    : base(BuildMismatchMessage(refs)) { }

// Builds the message for the multi-ref constructor; refs are joined with ", ".
private static string BuildMismatchMessage(string[] refs)
{
    string joinedRefs = string.Join(", ", refs);
    return $"Invalid format in {joinedRefs}. Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs.";
}
}
}
11 changes: 11 additions & 0 deletions src/SIL.Machine/Corpora/INParallelTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using System.Collections.Generic;

namespace SIL.Machine.Corpora
{
/// <summary>
/// A corpus whose rows are <see cref="NParallelTextRow"/> instances.
/// </summary>
public interface INParallelTextCorpus : ICorpus<NParallelTextRow>
{
/// <summary>
/// Returns a row count.
/// </summary>
/// <param name="includeEmpty">
/// Defaults to <c>true</c>. NOTE(review): presumably controls whether empty rows are counted;
/// confirm against the implementations.
/// </param>
/// <param name="textIds">
/// Defaults to <c>null</c>. NOTE(review): presumably restricts the count to these text ids;
/// confirm against the implementations.
/// </param>
int Count(bool includeEmpty = true, IEnumerable<string> textIds = null);

/// <summary>
/// Returns rows of the corpus.
/// </summary>
/// <param name="textIds">
/// NOTE(review): presumably the ids of the texts whose rows should be returned; confirm
/// against the implementations.
/// </param>
IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textIds);
}
}
8 changes: 8 additions & 0 deletions src/SIL.Machine/Corpora/MergeRule.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace SIL.Machine.Corpora
{
/// <summary>
/// Rule used to pick one corpus per row when several text corpora are merged into one.
/// </summary>
public enum MergeRule
{
    /// <summary>Pick the first of the candidate corpora.</summary>
    First = 0,

    /// <summary>Pick one of the candidate corpora at random.</summary>
    Random = 1
}
}
77 changes: 77 additions & 0 deletions src/SIL.Machine/Corpora/MergedTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
using System;
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
/// <summary>
/// A text corpus that presents several text corpora as one. For every row of the underlying
/// <see cref="NParallelTextCorpus"/> it emits the segment and flags of exactly one of the
/// corpora, chosen according to a <see cref="MergeRule"/>.
/// </summary>
public class MergedTextCorpus : TextCorpusBase
{
    private readonly NParallelTextCorpus _corpus;

    private readonly MergeRule _mergeRule;

    private readonly Random _random;

    /// <summary>
    /// Creates a merged view over the given corpora.
    /// </summary>
    /// <param name="corpora">The corpora to merge. Enumerated exactly once.</param>
    /// <param name="mergeRule">How to pick a corpus when more than one is a candidate for a row.</param>
    /// <param name="seed">
    /// Optional seed for the random number generator used by <see cref="MergeRule.Random"/>;
    /// when <c>null</c>, a default-seeded <see cref="Random"/> is used.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="corpora"/> is <c>null</c>.</exception>
    public MergedTextCorpus(IEnumerable<ITextCorpus> corpora, MergeRule mergeRule, int? seed = null)
    {
        if (corpora == null)
            throw new ArgumentNullException(nameof(corpora));

        // Materialize once. The sequence is needed both to build the parallel corpus and to size
        // AllRows; enumerating a lazy or one-shot sequence twice could make the two disagree.
        IReadOnlyList<ITextCorpus> corpusList = corpora.ToList();
        _corpus = new NParallelTextCorpus(corpusList)
        {
            AllRows = Enumerable.Repeat(true, corpusList.Count).ToArray()
        };
        _mergeRule = mergeRule;
        _random = seed.HasValue ? new Random(seed.Value) : new Random();
    }

    /// <summary>The texts of every underlying corpus, concatenated in corpus order.</summary>
    public override IEnumerable<IText> Texts => _corpus.Corpora.SelectMany(c => c.Texts);

    /// <summary>
    /// <c>true</c> when the parallel corpus reports every one of its <c>N</c> corpora as tokenized.
    /// </summary>
    public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i));

    /// <summary>
    /// The versification of the first underlying corpus, or <c>null</c> when there are none.
    /// </summary>
    public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null;

    /// <summary>
    /// Yields one <see cref="TextRow"/> per n-parallel row (rows with no eligible corpus are
    /// skipped), carrying the segment and flags of the corpus selected for that row.
    /// </summary>
    /// <param name="textIds">Passed through to <see cref="NParallelTextCorpus.GetRows"/>.</param>
    /// <returns>The merged rows, produced lazily.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown during enumeration when the merge rule is not a defined <see cref="MergeRule"/> value.
    /// </exception>
    public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
    {
        // Index of the corpus whose range is currently being followed, or -1 when there is none.
        int indexOfInRangeRow = -1;
        foreach (NParallelTextRow nRow in _corpus.GetRows(textIds))
        {
            // Candidates are corpora whose segment has content or that the row reports as in range.
            IReadOnlyList<int> nonEmptyIndices = nRow
                .NSegments.Select((s, i) => (s, i))
                .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i))
                .Select(pair => pair.i)
                .ToList();

            // With no candidate at all, every corpus is eligible.
            IReadOnlyList<int> indices =
                nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList();
            if (indexOfInRangeRow == -1)
            {
                // Not following a range: drop corpora that are in a range but not at its start.
                indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList();
            }
            if (indices.Count == 0)
                continue;

            // The selection (including the random draw) is made even when it is overridden by the
            // followed range below, so the sequence of random numbers consumed stays the same.
            int indexOfSelectedRow;
            switch (_mergeRule)
            {
                case MergeRule.First:
                    indexOfSelectedRow = indices[0];
                    break;
                case MergeRule.Random:
                    indexOfSelectedRow = indices[_random.Next(0, indices.Count)];
                    break;
                default:
                    throw new InvalidOperationException($"Unsupported merge rule: {_mergeRule}.");
            }

            // While a range is being followed, stay with the corpus that started it.
            if (indexOfInRangeRow != -1)
                indexOfSelectedRow = indexOfInRangeRow;

            // Stop following once the selected corpus is no longer in range; start (or restart)
            // following when the selected corpus begins a range on this row.
            if (!nRow.IsInRange(indexOfSelectedRow))
            {
                indexOfInRangeRow = -1;
            }
            if (nRow.IsRangeStart(indexOfSelectedRow))
            {
                indexOfInRangeRow = indexOfSelectedRow;
            }

            yield return new TextRow(nRow.TextId, nRow.Ref)
            {
                Segment = nRow.NSegments[indexOfSelectedRow],
                Flags = nRow.NFlags[indexOfSelectedRow]
            };
        }
    }
}
}
Loading
Loading