diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 7d974366..372ba201 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -523,6 +523,33 @@ public override IEnumerable GetRows(IEnumerable textIds) #endregion + #region INParallelTextCorpus operations + + public static INParallelTextCorpus AlignMany( + this IEnumerable corpora, + IEnumerable allRowsPerCorpus = null + ) + { + NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); + if (allRowsPerCorpus != null) + { + nParallelTextCorpus.AllRows = allRowsPerCorpus.ToArray(); + } + return nParallelTextCorpus; + } + + public static ITextCorpus ChooseRandom(this IEnumerable corpora, int? seed = null) + { + return new MergedTextCorpus(corpora, MergeRule.Random, seed); + } + + public static ITextCorpus ChooseFirst(this IEnumerable corpora) + { + return new MergedTextCorpus(corpora, MergeRule.First); + } + + #endregion + #region IAlignmentCorpus operations public static IAlignmentCorpus Transform( diff --git a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs index c86dd8cf..2b812985 100644 --- a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs +++ b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef) : base( $"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." ) { } + + public CorpusAlignmentException(string[] refs) + : base( + $"Invalid format in {string.Join(", ", refs)}. Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." 
+ ) { } } } diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs new file mode 100644 index 00000000..5a1e86f7 --- /dev/null +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public interface INParallelTextCorpus : ICorpus + { + int Count(bool includeEmpty = true, IEnumerable textIds = null); + + IEnumerable GetRows(IEnumerable textIds); + } +} diff --git a/src/SIL.Machine/Corpora/MergeRule.cs b/src/SIL.Machine/Corpora/MergeRule.cs new file mode 100644 index 00000000..be9a2cee --- /dev/null +++ b/src/SIL.Machine/Corpora/MergeRule.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.Corpora +{ + public enum MergeRule + { + First, + Random + } +} diff --git a/src/SIL.Machine/Corpora/MergedTextCorpus.cs b/src/SIL.Machine/Corpora/MergedTextCorpus.cs new file mode 100644 index 00000000..5e85b60b --- /dev/null +++ b/src/SIL.Machine/Corpora/MergedTextCorpus.cs @@ -0,0 +1,77 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class MergedTextCorpus : TextCorpusBase + { + private readonly NParallelTextCorpus _corpus; + + private readonly MergeRule _mergeRule; + + private readonly Random _random; + + public MergedTextCorpus(IEnumerable corpora, MergeRule mergeRule, int? seed = null) + { + _corpus = new NParallelTextCorpus(corpora) { AllRows = Enumerable.Repeat(true, corpora.Count()).ToArray() }; + _mergeRule = mergeRule; + if (seed != null) + _random = new Random(seed.Value); + else + _random = new Random(); + } + + public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); + + public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); + + public override ScrVers Versification => _corpus.N > 0 ? 
_corpus.Corpora[0].Versification : null; + + public override IEnumerable GetRows(IEnumerable textIds) + { + int indexOfInRangeRow = -1; + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) + { + IReadOnlyList nonEmptyIndices = nRow + .NSegments.Select((s, i) => (s, i)) + .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) + .Select(pair => pair.i) + .ToList(); + IReadOnlyList indices = + nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; + switch (_mergeRule) + { + case MergeRule.First: + indexOfSelectedRow = indices.First(); + break; + case MergeRule.Random: + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; + break; + } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.IsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.IsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs new file mode 100644 index 00000000..c8f2c904 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -0,0 +1,523 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextCorpus : NParallelTextCorpusBase + { + public NParallelTextCorpus(IEnumerable corpora, IComparer rowRefComparer = null) + { + Corpora = corpora.ToImmutableArray(); + if (Corpora.Count < 1) + throw new ArgumentException("There must be 
at least one corpora.", nameof(corpora)); + RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + AllRows = new bool[Corpora.Count] + .Select(_ => false) + .ToImmutableArray(); + } + + public override bool IsTokenized(int i) => + i < Corpora.Count ? Corpora[i].IsTokenized : throw new ArgumentOutOfRangeException(nameof(i)); + + public override int N => Corpora.Count; + public IReadOnlyList AllRows { get; set; } + public override IReadOnlyList Corpora { get; } + public IComparer RowRefComparer { get; } + + private HashSet GetTextIdsFromCorpora() + { + HashSet textIds = new HashSet(); + HashSet allRowsTextIds = new HashSet(); + for (int i = 0; i < Corpora.Count; i++) + { + if (i == 0) + textIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); + else + textIds.IntersectWith(Corpora[i].Texts.Select(t => t.Id)); + + if (AllRows[i]) + allRowsTextIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); + } + textIds.UnionWith(allRowsTextIds); + return textIds; + } + + public override IEnumerable GetRows(IEnumerable textIds) + { + HashSet filterTextIds = GetTextIdsFromCorpora(); + + if (textIds != null) + filterTextIds.IntersectWith(textIds); + + List> enumeratedCorpora = new List>(); + try + { + for (int i = 0; i < Corpora.Count; i++) + { + IEnumerator enumerator = Corpora[i].GetRows(filterTextIds).GetEnumerator(); + enumeratedCorpora.Add( + new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) + ); + } + foreach (NParallelTextRow row in GetRows(enumeratedCorpora)) + yield return row; + } + finally + { + foreach (IEnumerator enumerator in enumeratedCorpora) + enumerator.Dispose(); + } + } + + private static bool AllInRangeHaveSegments(IList rows) + { + return rows.All(r => (r.IsInRange && !r.IsEmpty) || (!r.IsInRange)); + } + + private IList MinRefIndexes(IList refs) + { + object minRef = refs[0]; + List minRefIndexes = new List() { 0 }; + for (int i = 1; i < refs.Count; i++) + { + if (RowRefComparer.Compare(refs[i], minRef) < 0) + { + 
minRef = refs[i]; + minRefIndexes.Clear(); + minRefIndexes.Add(i); + } + else if (RowRefComparer.Compare(refs[i], minRef) == 0) + { + minRefIndexes.Add(i); + } + } + return minRefIndexes; + } + + private IEnumerable GetRows(IList> enumerators) + { + var rangeInfo = new NRangeInfo(N) + { + Versifications = Corpora.Select(c => c.Versification).ToArray(), + RowRefComparer = RowRefComparer + }; + List> sameRefRows = new List>(); + for (int i = 0; i < N; i++) + { + sameRefRows.Add(new List()); + } + + bool[] completed = new bool[N]; + int numCompleted = 0; + for (int i = 0; i < N; i++) + { + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + numCompleted++; + } + int numberOfRemainingRows = N - numCompleted; + + while (numCompleted < N) + { + List minRefIndexes; + List currentRows = enumerators.Select(e => e.Current).ToList(); + try + { + minRefIndexes = MinRefIndexes( + currentRows + .Select( + (e, i) => + { + if (!completed[i]) + return e.Ref; + return null; + } + ) + .ToArray() + ) + .ToList(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + } + List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) + //then there are some non-min refs or only one incomplete enumerator + { + if ( + nonMinRefIndexes.Any(i => !AllRows[i]) //At least one of the non-min rows has not been marked as 'all rows' + && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) //and at least one of the min rows is not completed and in a range + ) + { + foreach (int i in minRefIndexes) + rangeInfo.AddTextRow(enumerators[i].Current, i); + foreach (int i in nonMinRefIndexes) + sameRefRows[i].Clear(); + } + else + { + bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => + !completed[i] && !currentRows[i].IsRangeStart && 
currentRows[i].IsInRange + ); + foreach ( + NParallelTextRow row in CreateMinRefRows( + rangeInfo, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), + sameRefRows, + forceInRange: minRefIndexes + .Select(i => + anyNonMinEnumeratorsMidRange + && nonMinRefIndexes.All(j => + !completed[j] && currentRows[j].TextId == currentRows[i].TextId + ) //All non-min rows have the same textId as the given min row + ) + .ToList() + ) + ) + { + yield return row; + } + } + foreach (int i in minRefIndexes) + { + sameRefRows[i].Add(enumerators[i].Current); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } + } + } + else if (minRefIndexes.Count == numberOfRemainingRows) + // the refs are all the same + { + if ( + minRefIndexes.Any(i => + currentRows[i].IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) + ) //At least one row is in range while the other rows are all not marked as 'all rows' + ) + { + if ( + rangeInfo.IsInRange + && AllInRangeHaveSegments(currentRows.Where((r, i) => !completed[i]).ToArray()) + ) + { + yield return rangeInfo.CreateRow(); + } + + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + rangeInfo.AddTextRow(currentRows[i], i); + sameRefRows[i].Clear(); + } + } + else + { + foreach ( + NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows, sameRefRows) + ) + { + yield return row; + } + + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentRows.Select((r, i) => completed[i] ? 
null : r).ToArray() + ) + ) + { + yield return row; + } + } + + foreach (int i in minRefIndexes) /* advance only the enumerators that produced this ref; polling an already-finished enumerator again would count its completion twice and end the loop early */ + { + sameRefRows[i].Add(currentRows[i]); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } + } + } + else + { + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); + } + } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + } + + private object[] CorrectVersification(object[] refs, int i) + { + if (Corpora.Any(c => c.Versification == null) || refs.Length == 0) + return refs; + return refs.Cast() + .Select(r => r.ChangeVersification(Corpora[i].Versification)) + .Cast() + .ToArray(); + } + + private IEnumerable CreateRows( + NRangeInfo rangeInfo, + IReadOnlyList rows, + IReadOnlyList forceInRange = null + ) + { + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + + if (rows.All(r => r == null)) + throw new ArgumentNullException(nameof(rows), "A corpus row must be specified."); + + object[] defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; + string textId = null; + object[][] refs = new object[N][]; + TextRowFlags[] flags = new TextRowFlags[N]; + for (int i = 0; i < rows.Count; i++) + { + if (rows[i] != null) + { + textId = textId ?? rows[i]?.TextId; + refs[i] = CorrectVersification(rows[i].Ref == null ? defaultRefs : new object[] { rows[i].Ref }, i); + flags[i] = rows[i].Flags; + } + else + { + if (Corpora[i].IsScripture()) + refs[i] = CorrectVersification(defaultRefs, i); + else + refs[i] = new object[] { }; + flags[i] = forceInRange != null && i < forceInRange.Count && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None; /* forceInRange can hold fewer than N entries, so bound-check before indexing */ + } + } + refs = refs.Select(r => r ?? (new object[] { })).ToArray(); + + yield return new NParallelTextRow(textId, refs) + { + NSegments = rows.Select(r => r?.Segment ?? 
Array.Empty()).ToArray(), + NFlags = flags.ToReadOnlyList() + }; + } + + private IEnumerable CreateMinRefRows( + NRangeInfo rangeInfo, + IReadOnlyList currentRows, + IReadOnlyList minRefIndexes, + IReadOnlyList nonMinRefIndexes, + IReadOnlyList> sameRefRowsPerIndex, + IReadOnlyList forceInRange = null + ) + { + HashSet alreadyYielded = new HashSet(); + TextRow[] textRows; + foreach (int i in minRefIndexes) + { + TextRow textRow = currentRows[i]; + foreach (int j in nonMinRefIndexes) + { + IList sameRefRows = sameRefRowsPerIndex[j]; + if (CheckSameRefRows(sameRefRows, textRow)) + { + alreadyYielded.Add(i); + foreach (TextRow sameRefRow in sameRefRows) + { + textRows = new TextRow[N]; + textRows[i] = textRow; + textRows[j] = sameRefRow; + foreach ( + NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRange) + ) + { + yield return row; + } + } + } + } + } + textRows = new TextRow[N]; + var forceCurrentInRange = new bool[N]; + bool rowsHaveContent = false; + foreach (int i in minRefIndexes.Where(i => AllRows[i]).Except(alreadyYielded)) + { + TextRow textRow = currentRows[i]; + textRows[i] = textRow; + forceCurrentInRange[i] = forceCurrentInRange[i]; + rowsHaveContent = true; + } + if (rowsHaveContent) + { + foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) + { + yield return row; + } + } + } + + private bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + + private IEnumerable CreateSameRefRows( + NRangeInfo rangeInfo, + IList completed, + IList currentRows, + IReadOnlyList> sameRefRows + ) + { + for (int i = 0; i < N; i++) + { + if (completed[i]) + continue; + + for (int j = 0; j < N; j++) + { + if (i 
== j || completed[j]) + continue; + + if (CheckSameRefRows(sameRefRows[i], currentRows[j])) + { + foreach (TextRow tr in sameRefRows[i]) + { + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) + { + yield return r; + } + } + } + } + } + } + + private class RangeRow + { + public IList Refs { get; } = new List(); + public IList Segment { get; } = new List(); + public bool IsSentenceStart { get; set; } = false; + public bool IsInRange => Refs.Count > 0; + public bool IsEmpty => Segment.Count == 0; + } + + private class NRangeInfo + { + public int N; + public string TextId { get; set; } = ""; + public ScrVers[] Versifications { get; set; } = null; + public IComparer RowRefComparer { get; set; } = null; + public List Rows { get; } + public bool IsInRange => Rows.Any(r => r.IsInRange); + + public NRangeInfo(int n) + { + N = n; + Rows = new List(); + for (int i = 0; i < N; i++) + { + Rows.Add(new RangeRow()); + } + } + + public void AddTextRow(TextRow row, int index) + { + if (N <= index) + { + throw new ArgumentOutOfRangeException( + $"There are only {N} parallel texts, but text {index} was chosen." 
+ ); + } + TextId = row.TextId; + Rows[index].Refs.Add(row.Ref); + if (Rows[index].IsEmpty) + Rows[index].IsSentenceStart = row.IsSentenceStart; + Rows[index].Segment.AddRange(row.Segment); + } + + public NParallelTextRow CreateRow() + { + object[][] refs = new object[N][]; + List referenceRefs = Rows.Where(r => r.Refs.Count > 0) + .Select(r => r.Refs) + .FirstOrDefault() + .ToList(); + foreach (int i in Enumerable.Range(0, Rows.Count)) + { + var row = Rows[i]; + + if (Versifications.All(v => v != null) && row.Refs.Count() == 0) + { + refs[i] = referenceRefs + .ToArray() + .Cast() + .Select(r => r.ChangeVersification(Versifications[i])) + .Cast() + .ToArray(); + } + else + { + refs[i] = row.Refs.ToArray(); + } + } + var nParRow = new NParallelTextRow(TextId, refs) + { + NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), + NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) + .ToArray() + }; + TextId = ""; + foreach (RangeRow r in Rows) + { + r.Refs.Clear(); + r.Segment.Clear(); + r.IsSentenceStart = false; + } + return nParRow; + } + } + + public class DefaultRowRefComparer : IComparer + { + public int Compare(object x, object y) + { + // Do not use the default comparer for ScriptureRef, since we want to ignore segments + if (x is ScriptureRef sx && y is ScriptureRef sy) + return sx.CompareTo(sy, compareSegments: false); + if (x == null && y != null) + return 1; + if (x != null && y == null) + return -1; + + return Comparer.Default.Compare(x, y); + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs new file mode 100644 index 00000000..73ccf56f --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs @@ -0,0 +1,42 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public abstract class NParallelTextCorpusBase : INParallelTextCorpus + { + public abstract 
int N { get; } + + public abstract IReadOnlyList Corpora { get; } + + public abstract bool IsTokenized(int i); + + int ICorpus.Count(bool includeEmpty) + { + return Count(includeEmpty, null); + } + + public virtual int Count(bool includeEmpty = true, IEnumerable textIds = null) + { + return includeEmpty ? GetRows(textIds).Count() : GetRows(textIds).Count(r => !r.IsEmpty); + } + + public IEnumerable GetRows() + { + return GetRows(null); + } + + public abstract IEnumerable GetRows(IEnumerable textIds); + + public IEnumerator GetEnumerator() + { + return GetRows().GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs new file mode 100644 index 00000000..4d58e907 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -0,0 +1,54 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextRow : IRow + { + public NParallelTextRow(string textId, IEnumerable> nRefs) + { + if (string.IsNullOrEmpty(textId)) + throw new ArgumentNullException(nameof(textId)); + + if (nRefs == null || nRefs.Where(r => r != null).SelectMany(r => r).Count() == 0) + throw new ArgumentNullException($"Refs must be provided but nRefs={nRefs}"); + + TextId = textId; + NRefs = nRefs.ToList().ToReadOnlyList(); + N = NRefs.Count; + NSegments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); + NFlags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + } + + public string TextId { get; } + + public object Ref => NRefs.SelectMany(r => r).First(); + + public IReadOnlyList> NRefs { get; } + public int N { get; } + + public IReadOnlyList> NSegments { get; set; } + public IReadOnlyList NFlags { get; set; } + + public bool IsSentenceStart(int i) => + 
NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); + + public bool IsInRange(int i) => + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); + + public bool IsRangeStart(int i) => + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); + + public bool IsEmpty => NSegments.All(s => s.Count == 0); + + public string Text(int i) => string.Join(" ", NSegments[i]); + + public NParallelTextRow Invert() + { + return new NParallelTextRow(TextId, NRefs.Reverse()) { /* reverse the segments together with refs and flags; otherwise the inverted row falls back to the constructor's empty segments */ NSegments = NSegments.Reverse().ToImmutableArray(), NFlags = NFlags.Reverse().ToImmutableArray(), }; + } + } +} diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 2f8a4884..53d07257 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,9 +1,6 @@ using System; -using System.Collections; using System.Collections.Generic; using System.Linq; -using SIL.ObjectModel; -using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -19,7 +16,8 @@ public ParallelTextCorpus( SourceCorpus = sourceCorpus; TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); - RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + RowRefComparer = rowRefComparer ?? 
new NParallelTextCorpus.DefaultRowRefComparer(); + _nParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -33,618 +31,47 @@ public ParallelTextCorpus( public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } + private readonly NParallelTextCorpus _nParallelTextCorpus; + public override IEnumerable GetRows(IEnumerable textIds) { - IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); - IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - - HashSet filterTextIds; - if (AllSourceRows && AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.UnionWith(targetTextIds); - } - else if (!AllSourceRows && !AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.IntersectWith(targetTextIds); - } - else if (AllSourceRows) + using (IEnumerator alignmentEnumerator = AlignmentCorpus.GetEnumerator()) { - filterTextIds = new HashSet(sourceTextIds); - } - else - { - filterTextIds = new HashSet(targetTextIds); - } - - if (textIds != null) - filterTextIds.IntersectWith(textIds); - - using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) - using ( - var trgEnumerator = new TargetCorpusEnumerator( - TargetCorpus.GetRows(filterTextIds).GetEnumerator(), - SourceCorpus.Versification, - TargetCorpus.Versification - ) - ) - using ( - IEnumerator alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator() - ) - { - var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; - var sourceSameRefRows = new List(); - var targetSameRefRows = new List(); - - bool srcCompleted = !srcEnumerator.MoveNext(); - bool trgCompleted = !trgEnumerator.MoveNext(); - while (!srcCompleted && !trgCompleted) + _nParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = 
SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in _nParallelTextCorpus.GetRows(textIds)) { - int compare1 = 0; - try - { - compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - if (compare1 < 0) + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null && nRow.NSegments.All(s => s.Count > 0)) { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare1 > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - 
rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - else - { - int compare2; do { try { - compare2 = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare(srcEnumerator.Current.Ref, alignmentEnumerator.Current.Ref) + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare(nRow.Ref, alignmentEnumerator.Current.Ref) : 1; } catch (ArgumentException) { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - } while (compare2 < 0); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - 
rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } + throw new CorpusAlignmentException(nRow.NRefs.Select(r => r.ToString()).ToArray()); } - - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); + } while (compareAlignmentCorpus < 0); } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) - ) - { - yield return row; - } - } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && 
trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) - ) - { - yield return row; - } - } - trgCompleted = !trgEnumerator.MoveNext(); - } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - } - } - - private IEnumerable CreateRows( - RangeInfo rangeInfo, - TextRow srcRow, - TextRow trgRow, - IReadOnlyCollection alignedWordPairs = null, - bool forceSourceInRange = false, - bool forceTargetInRange = false - ) - { - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - - string textId; - if (srcRow != null) - textId = srcRow.TextId; - else if (trgRow != null) - textId = trgRow.TextId; - else - throw new ArgumentNullException("Either a source or target must be specified."); - - object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); - if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) - { - targetRefs = sourceRefs - .Cast() - .Select(r => r.ChangeVersification(TargetCorpus.Versification)) - .Cast() - .ToArray(); - } - - TextRowFlags sourceFlags; - if (srcRow == null) - sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - sourceFlags = srcRow.Flags; - - TextRowFlags targetFlags; - if (trgRow == null) - targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - targetFlags = trgRow.Flags; - - yield return new ParallelTextRow(textId, sourceRefs, targetRefs) - { - SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), - TargetSegment = trgRow != null ? 
trgRow.Segment : Array.Empty(), - AlignedWordPairs = alignedWordPairs, - SourceFlags = sourceFlags, - TargetFlags = targetFlags - }; - } - - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - - private IEnumerable CreateSourceRows( - RangeInfo rangeInfo, - TextRow sourceRow, - List targetSameRefRows, - bool forceTargetInRange = false - ) - { - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) - { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? 
nRow.NRefs[1] : new object[] { nRow.Ref } ) - ) - { - yield return row; - } - } - } - - private IEnumerable CreateTargetRows( - RangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) - { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) - { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; - } - } - } - - private class RangeInfo - { - public string TextId { get; set; } = ""; - public List SourceRefs { get; } = new List(); - public List TargetRefs { get; } = new List(); - public List SourceSegment { get; } = new List(); - public List TargetSegment { get; } = new List(); - public bool IsSourceSentenceStart { get; set; } = false; - public bool IsTargetSentenceStart { get; set; } = false; - public bool IsInRange => SourceRefs.Count > 0 || TargetRefs.Count > 0; - public bool IsSourceEmpty => SourceSegment.Count == 0; - public bool IsTargetEmpty => TargetSegment.Count == 0; - - public ScrVers TargetVersification { get; set; } = null; - - public ParallelTextRow CreateRow() - { - object[] trgRefs = TargetRefs.ToArray(); - if (TargetRefs.Count == 0 && TargetVersification != null) - { - trgRefs = SourceRefs - .ToArray() - .Cast() - .Select(r => r.ChangeVersification(TargetVersification)) - .Cast() - .ToArray(); - } - var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), trgRefs) - { - SourceSegment = SourceSegment.ToArray(), - TargetSegment = TargetSegment.ToArray(), - SourceFlags = IsSourceSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None, - TargetFlags = IsTargetSentenceStart ? 
TextRowFlags.SentenceStart : TextRowFlags.None - }; - TextId = ""; - SourceRefs.Clear(); - TargetRefs.Clear(); - SourceSegment.Clear(); - TargetSegment.Clear(); - IsSourceSentenceStart = false; - IsTargetSentenceStart = false; - return row; - } - } - - private class DefaultRowRefComparer : IComparer - { - public int Compare(object x, object y) - { - // Do not use the default comparer for ScriptureRef, since we want to ignore segments - if (x is ScriptureRef sx && y is ScriptureRef sy) - return sx.CompareTo(sy, compareSegments: false); - - return Comparer.Default.Compare(x, y); - } - } - - private class TargetCorpusEnumerator : DisposableBase, IEnumerator - { - private readonly IEnumerator _enumerator; - private readonly bool _isScripture = false; - private readonly Queue _verseRows; - private readonly ScrVers _sourceVersification; - private TextRow _current; - private bool _isEnumerating = false; - private bool _enumeratorHasMoreData = true; - - public TargetCorpusEnumerator( - IEnumerator enumerator, - ScrVers sourceVersification, - ScrVers targetVersification - ) - { - _enumerator = enumerator; - _sourceVersification = sourceVersification; - _isScripture = - sourceVersification != null - && targetVersification != null - && sourceVersification != targetVersification; - _verseRows = new Queue(); - } - - public TextRow Current => _current; - - object IEnumerator.Current => Current; - - public bool MoveNext() - { - if (_isScripture) - { - if (!_isEnumerating) { - _enumerator.MoveNext(); - _isEnumerating = true; - } - if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) - CollectVerses(); - if (_verseRows.Count > 0) - { - _current = _verseRows.Dequeue(); - return true; - } - _current = null; - return false; + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = + compareAlignmentCorpus == 0 ? 
alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null + }; } - - _enumeratorHasMoreData = _enumerator.MoveNext(); - _current = _enumerator.Current; - return _enumeratorHasMoreData; - } - - public void Reset() - { - _enumerator.Reset(); - _isEnumerating = false; - _enumeratorHasMoreData = true; - } - - protected override void DisposeManagedResources() - { - _enumerator.Dispose(); - } - - private void CollectVerses() - { - var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); - bool outOfOrder = false; - ScriptureRef prevScrRef = ScriptureRef.Empty; - int rangeStartOffset = -1; - do - { - TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) - break; - - scrRef = scrRef.ChangeVersification(_sourceVersification); - // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevScrRef)) - { - (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ - rowList.Count + rangeStartOffset - ]; - TextRowFlags flags = TextRowFlags.InRange; - if (rangeStartRow.IsSentenceStart) - flags |= TextRowFlags.SentenceStart; - if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) - flags |= TextRowFlags.RangeStart; - rowList[rowList.Count + rangeStartOffset] = ( - rangeStartVerseRef, - new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) - { - Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), - Flags = flags - } - ); - row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; - rangeStartOffset--; - } - else - { - rangeStartOffset = -1; - } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) - outOfOrder = true; - prevScrRef = scrRef; - _enumeratorHasMoreData = _enumerator.MoveNext(); - } while (_enumeratorHasMoreData); - - if (outOfOrder) - rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); - - foreach ((ScriptureRef _, TextRow row) in rowList) - 
_verseRows.Enqueue(row); } } } diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs new file mode 100644 index 00000000..7653a135 --- /dev/null +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -0,0 +1,122 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + internal class TextCorpusEnumerator : DisposableBase, IEnumerator + { + private readonly IEnumerator _enumerator; + private readonly bool _isScripture = false; + private readonly Queue _verseRows; + private readonly ScrVers _refVersification; + private TextRow _current; + private bool _isEnumerating = false; + private bool _enumeratorHasMoreData = true; + + public TextCorpusEnumerator(IEnumerator enumerator, ScrVers refVersification, ScrVers versification) + { + _enumerator = enumerator; + _refVersification = refVersification; + _isScripture = refVersification != null && versification != null && refVersification != versification; + _verseRows = new Queue(); + } + + public TextRow Current => _current; + + object IEnumerator.Current => Current; + + public bool MoveNext() + { + if (_isScripture) + { + if (!_isEnumerating) + { + _enumerator.MoveNext(); + _isEnumerating = true; + } + if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) + CollectVerses(); + if (_verseRows.Count > 0) + { + _current = _verseRows.Dequeue(); + return true; + } + _current = null; + return false; + } + + _enumeratorHasMoreData = _enumerator.MoveNext(); + _current = _enumerator.Current; + return _enumeratorHasMoreData; + } + + public void Reset() + { + _enumerator.Reset(); + _isEnumerating = false; + _enumeratorHasMoreData = true; + } + + protected override void DisposeManagedResources() + { + _enumerator.Dispose(); + } + + private void CollectVerses() + { + var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); + bool 
outOfOrder = false; + ScriptureRef prevRefRef = ScriptureRef.Empty; + int rangeStartOffset = -1; + do + { + TextRow row = _enumerator.Current; + var refRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) + break; + + refRef = refRef.ChangeVersification(_refVersification); + // convert one-to-many versification mapping to a verse range + if (refRef.Equals(prevRefRef)) + { + (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ + rowList.Count + rangeStartOffset + ]; + TextRowFlags flags = TextRowFlags.InRange; + if (rangeStartRow.IsSentenceStart) + flags |= TextRowFlags.SentenceStart; + if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) + flags |= TextRowFlags.RangeStart; + rowList[rowList.Count + rangeStartOffset] = ( + rangeStartVerseRef, + new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) + { + Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), + Flags = flags + } + ); + row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; + rangeStartOffset--; + } + else + { + rangeStartOffset = -1; + } + rowList.Add((refRef, row)); + if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) + outOfOrder = true; + prevRefRef = refRef; + _enumeratorHasMoreData = _enumerator.MoveNext(); + } while (_enumeratorHasMoreData); + + if (outOfOrder) + rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); + + foreach ((ScriptureRef _, TextRow row) in rowList) + _verseRows.Enqueue(row); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 29b645b9..836f3bda 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -1,4 +1,5 @@ -using NUnit.Framework; +using System.Text.Json; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -64,4 +65,230 @@ public void ExtractScripture() 
Assert.That(origRef, Is.EqualTo(new VerseRef("MAT 2:12", ScrVers.Original))); Assert.That(corpusRef, Is.EqualTo(new VerseRef("MAT 2:12", corpus.Versification))); } + + [Test] + public void MergedCorpus_SelectFirst() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source 1 segment 1 ."), TextRow("text1", 3) }) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseFirst(); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed123456() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var mergedCorpus 
= new List { corpus1, corpus2, corpus3 }.ChooseRandom(123456); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 1 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 1 segment 3 .")); + }); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed4501() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom(4501); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + }); + } + + [Test] + public void AlignMergedCorpora() + { + var sourceCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var sourceCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + 
TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var sourceCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + + ITextCorpus sourceCorpus = new List { sourceCorpus1, sourceCorpus2, sourceCorpus3 }.ChooseFirst(); + + var targetCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 1 segment 1 ."), + TextRow("text1", 2, "target 1 segment 2 ."), + TextRow("text1", 3, "target 1 segment 3 .") + } + ) + ); + var targetCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 2 segment 1 ."), + TextRow("text1", 2, "target 2 segment 2 ."), + TextRow("text1", 3, "target 2 segment 3 .") + } + ) + ); + var targetCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 3 segment 1 ."), + TextRow("text1", 2, "target 3 segment 2 ."), + TextRow("text1", 3, "target 3 segment 3 .") + } + ) + ); + + ITextCorpus targetCorpus = new List { targetCorpus1, targetCorpus2, targetCorpus3 }.ChooseFirst(); + + IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); + ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(3)); + Assert.That(rows[0].SourceText, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[2].TargetText, Is.EqualTo("target 1 segment 3 .")); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? 
Array.Empty() : text.Split(), + Flags = flags + }; + } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs new file mode 100644 index 00000000..68dc9f90 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -0,0 +1,570 @@ +using System.Text.Json; +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class NParallelTextCorpusTests +{ + [Test] + public void GetRows_ThreeCorpora() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].IsSentenceStart(0), Is.False); + Assert.That(rows[0].IsSentenceStart(1), Is.True); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].IsSentenceStart(1), Is.False); + Assert.That(rows[2].IsSentenceStart(2), Is.True); + } + + [Test] + public void 
GetRows_ThreeCorpora_MissingRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[0].IsSentenceStart(0), Is.True); + Assert.That(rows[0].IsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + 
Assert.That(rows[2].IsSentenceStart(0), Is.True); + Assert.That(rows[2].IsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[1].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[1].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[1].IsSentenceStart(0), Is.True); + Assert.That(rows[1].IsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var 
nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].IsSentenceStart(1), Is.True); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 ."), }) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, false] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].IsSentenceStart(0), Is.True); + } + + [Test] + public void GetRows_OneCorpus() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRows = [true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => 
r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].IsSentenceStart(0), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_Range() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.SequenceEqual([2, 3]))); + Assert.That(rows[1].NSegments[0], Is.EqualTo("source segment 2 . source segment 3 .".Split())); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . 
source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1), JsonSerializer.Serialize(rows)); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllIndividualRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . 
source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [false, false, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeOneThroughTwoRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . 
source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [false, true, false] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1, 2 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeTwoThroughThreeRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . 
source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, false] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_SameRefManyToMany() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(10)); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + 
return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index d40529c6..8df6d787 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -444,6 +444,55 @@ public void GetRows_OverlappingRanges() Assert.That(rows[0].IsTargetSentenceStart, Is.True); } + [Test] + public void GetRows_OverlappingRangesAndMissingRows() + { + var sourceCorpus = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var targetCorpus = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 3, + "target segment 3 . target segment 4 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 4, flags: TextRowFlags.InRange) + } + ) + ); + + var parallelCorpus = new ParallelTextCorpus(sourceCorpus, targetCorpus); + ParallelTextRow[] rows = parallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].SourceRefs, Is.EqualTo(new[] { 1, 2, 3 })); + Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 3, 4 })); + Assert.That( + rows[0].SourceSegment, + Is.EqualTo("source segment 1 . source segment 2 . source segment 3 .".Split()) + ); + Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 3 . target segment 4 .".Split())); + Assert.That(rows[0].IsSourceSentenceStart, Is.True); + Assert.That(rows[0].IsTargetSentenceStart, Is.True); + } + [Test] public void GetRows_AdjacentRangesSameText() {