From 266aa90afb9143cb100fd4b6b85ffa81e050e784 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 25 Oct 2024 13:06:26 -0400 Subject: [PATCH] Compiling but not working --- .../Corpora/NParallelTextCorpus.cs | 481 +++--------------- .../Corpora/TextCorpusEnumerator.cs | 14 +- 2 files changed, 89 insertions(+), 406 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 0530d0cb..dc2b4b6e 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -1,11 +1,9 @@ using System; -using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; using SIL.Extensions; using SIL.Linq; -using SIL.ObjectModel; using SIL.Scripture; namespace SIL.Machine.Corpora @@ -89,10 +87,9 @@ public override IEnumerable GetRows(IEnumerable textId } } - private bool AnyInRangeWithSegments(IList> listOfEnumerators) + private bool AnyInRangeWithSegments(IList rows) { - return listOfEnumerators.Any(e => e.Current.IsInRange) - && listOfEnumerators.All(e => !(e.Current.IsInRange && e.Current.Segment.Count == 0)); + return rows.Any(r => r.IsInRange) && rows.All(r => !(r.IsInRange && r.Segment.Count == 0)); } private IList MinRefIndexes(IList refs) @@ -123,19 +120,23 @@ private IEnumerable GetRows(IList> listOf List[] sameRefRows = new List[Corpora.Count]; bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); - while (!completed.Any()) + while (!completed.All(c => c)) { IList minRefIndexes; - IList currentRefs = listOfEnumerators.Select(e => e.Current.Ref).ToArray(); + IList currentRows = listOfEnumerators + .Where((e, i) => !completed[i]) + .Select(e => e.Current) + .ToArray(); try { - minRefIndexes = MinRefIndexes(currentRefs); + minRefIndexes = MinRefIndexes(currentRows.Select(e => e.Ref).ToArray()); } catch (ArgumentException) { - throw new CorpusAlignmentException(currentRefs.Select(r => r.ToString()).ToArray()); + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - if (minRefIndexes.Count < N) + + if (minRefIndexes.Count < (N - completed.Count(c => c))) //then there are some non-min refs { IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); IReadOnlyList allNonMinRows = nonMinRefIndexes @@ -169,8 +170,15 @@ private IEnumerable GetRows(IList> listOf NParallelTextRow row in CreateMinRefRows( rangeInfo, minEnumerators.Select(e => e.Current).ToList(), - nonMinEnumerators.Select(e => e.Current).ToList(), - allNonMinRows + nonMinRefIndexes, + forceInRange: minEnumerators + .Select(e => e.Current.TextId) + .Union(nonMinEnumerators.Select(e => e.Current.TextId)) + .Distinct() + .Count() == 1 + && nonMinEnumerators + .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) + .Any() ) ) { @@ -182,353 +190,60 @@ NParallelTextRow row in CreateMinRefRows( listOfEnumerators[i].MoveNext(); } } - // source is less than target - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if (rangeInfo.IsInRange && AnyInRangeWithSegments(listOfEnumerators)) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } - } - - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - if (compare < 0) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); } - else - // compare == 0 - the refs are the same + else if (minRefIndexes.Count == (N - completed.Count(c => c))) + // the refs are all the same { if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) + !currentRows.Select((r, i) => AllRowsList[i]).Any() + && currentRows.Select(r => r.IsInRange).Any() ) { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) + if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentRows)) { yield return rangeInfo.CreateRow(); } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + for (int i = 0; i < currentRows.Count; i++) + { + rangeInfo.AddTextRow(currentRows[i], i); + rangeInfo.Rows[i].SameRefRows.Clear(); + } } else { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + foreach (var row in currentRows) //TODO walk through together { - foreach (TextRow prevSourceRow in sourceSameRefRows) + if (rangeInfo.CheckSameRefRows(row)) { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) + foreach (TextRow tr in rangeInfo.Rows.SelectMany(r => r.SameRefRows)) { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow + foreach ( + NParallelTextRow r in CreateRows(rangeInfo, new List { tr, row }) ) - ) - { - yield return row; + { + yield return r; + } } } } - - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) + foreach (NParallelTextRow row in CreateRows(rangeInfo, currentRows)) { yield return row; } } - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows - ) - ) + for (int i = 0; i < currentRows.Count; i++) { - yield return row; + rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); } } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } else { - foreach ( - NParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows - ) - ) - { - yield return row; - } + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); } - trgCompleted = !trgEnumerator.MoveNext(); } if (rangeInfo.IsInRange) @@ -573,7 +288,7 @@ private IEnumerable CreateRows( else { refs.Add(refRefs); - flags.Add(forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); + flags.Add(forceInRange == null || !forceInRange[i] ? TextRowFlags.None : TextRowFlags.InRange); } } @@ -584,82 +299,35 @@ private IEnumerable CreateRows( }; } - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - IList currentRows, - IList minRefIndexes, + IList minRefRows, IList nonMinRefIndexes, bool forceInRange = false ) { - IList minRows = minRefIndexes.Select(i => currentRows[i]).ToList(); - IList nonMinRows = nonMinRefIndexes.Select(i => currentRows[i]).ToList(); - - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) - { - foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange - ) - ) - { - yield return row; - } - } - } + List sameRefRows = rangeInfo + .Rows.Where((r, i) => nonMinRefIndexes.Contains(i)) + .SelectMany(r => r.SameRefRows) + .ToList(); - private IEnumerable CreateTargetRows( - NRangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) + foreach (TextRow textRow in minRefRows) { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) + if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) { - foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; + foreach (TextRow sameRefRow in sameRefRows) + { + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + new List() { textRow, sameRefRow }, + forceInRange: new List() { false, forceInRange } + ) + ) + { + yield return row; + } + } } } } @@ -683,7 +351,7 @@ private class NRangeInfo public List Rows { get; } = new List(); public bool IsInRange => Rows.Any(r => r.IsInRange); - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) + public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) { try { @@ -697,6 +365,21 @@ private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } + public bool CheckSameRefRows(TextRow row) + { + var sameRefRows = Rows.SelectMany(r => r.SameRefRows).ToList(); + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, row.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), row.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + public void AddTextRow(TextRow row, int index) { if (N <= row.Segment.Count) @@ -727,7 +410,7 @@ public NParallelTextRow CreateRow() .ToArray(); } } - var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs).ToArray()) + var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs.ToList()).ToArray()) { NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs index 592bfcc6..a0fed87b 100644 --- a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -74,13 +74,13 @@ private void CollectVerses() do { TextRow row = _enumerator.Current; - var refRef = (ScriptureRef)row.Ref; - if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) + var scrRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && scrRef.BookNum != prevRefRef.BookNum) break; - refRef = refRef.ChangeVersification(_refVersification); + scrRef = scrRef.ChangeVersification(_refVersification); // convert one-to-many versification mapping to a verse range - if (refRef.Equals(prevRefRef)) + if (scrRef.Equals(prevRefRef)) { (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ rowList.Count + rangeStartOffset @@ -105,10 +105,10 @@ private void CollectVerses() { rangeStartOffset = -1; } - rowList.Add((refRef, row)); - if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) + rowList.Add((scrRef, row)); + if (!outOfOrder && scrRef.CompareTo(prevRefRef) < 0) outOfOrder = true; - prevRefRef = refRef; + prevRefRef = scrRef; _enumeratorHasMoreData = _enumerator.MoveNext(); } while (_enumeratorHasMoreData);