From 27180f4090ca8d723745a6e02d930d9b0e677de4 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 25 Jul 2024 18:16:39 -0400 Subject: [PATCH 01/15] *Add custom exception for parsing *Fix off-by-one error *Handle triplicate, quadruplicate, n-plicate verses *Add test to cover triplicate verse --- .../ScriptureRefUsfmParserHandlerBase.cs | 2 +- src/SIL.Machine/Corpora/UsfmParser.cs | 749 +++++++++--------- .../Corpora/UsfmParsingException.cs | 14 + src/SIL.Machine/Corpora/UsfmTextUpdater.cs | 3 +- .../Corpora/UsfmMemoryTextTests.cs | 16 + 5 files changed, 414 insertions(+), 370 deletions(-) create mode 100644 src/SIL.Machine/Corpora/UsfmParsingException.cs diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 7d9e3391d..f7e9d5b73 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -53,7 +53,7 @@ public override void Verse( string pubNumber ) { - if (state.VerseRef.Equals(_curVerseRef)) + if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse) { EndVerseText(state, CreateVerseRefs()); // ignore duplicate verses diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index c17afb387..40b4a91b7 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; using SIL.Scripture; @@ -138,288 +139,382 @@ public void ProcessTokens() /// false if there were no more tokens process public bool ProcessToken() { - // If past end - if (State.Index >= State.Tokens.Count - 1) + try { - CloseAll(); - Handler?.EndUsfm(State); - return false; - } - else if (State.Index < 0) - { - Handler?.StartUsfm(State); - } + // If past end + if (State.Index >= State.Tokens.Count - 1) + { + 
CloseAll(); + Handler?.EndUsfm(State); + return false; + } + else if (State.Index < 0) + { + Handler?.StartUsfm(State); + } - // Move to next token - State.Index++; + // Move to next token + State.Index++; - State.LineNumber = State.Token.LineNumber; - State.ColumnNumber = State.Token.ColumnNumber; + State.LineNumber = State.Token.LineNumber; + State.ColumnNumber = State.Token.ColumnNumber; - // Update verse offset with previous token (since verse offset is from start of current token) - if (State.PrevToken != null) - State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); + // Update verse offset with previous token (since verse offset is from start of current token) + if (State.PrevToken != null) + State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); - // Skip over tokens that are to be skipped, ensuring that - // SpecialToken state is true. - if (State.SpecialTokenCount > 0) - { - State.SpecialTokenCount--; - State.SpecialToken = true; - return true; - } + // Skip over tokens that are to be skipped, ensuring that + // SpecialToken state is true. 
+ if (State.SpecialTokenCount > 0) + { + State.SpecialTokenCount--; + State.SpecialToken = true; + return true; + } - // Reset special token and figure status - State.SpecialToken = false; + // Reset special token and figure status + State.SpecialToken = false; - UsfmToken token = State.Token; + UsfmToken token = State.Token; - // Switch unknown types to either character or paragraph - UsfmTokenType tokenType = token.Type; - if (tokenType == UsfmTokenType.Unknown) - tokenType = DetermineUnknownTokenType(); + // Switch unknown types to either character or paragraph + UsfmTokenType tokenType = token.Type; + if (tokenType == UsfmTokenType.Unknown) + tokenType = DetermineUnknownTokenType(); - if (Handler != null && !string.IsNullOrEmpty(token.Marker)) - Handler.GotMarker(State, token.Marker); + if (Handler != null && !string.IsNullOrEmpty(token.Marker)) + Handler.GotMarker(State, token.Marker); - // Close open elements - switch (tokenType) - { - case UsfmTokenType.Book: - case UsfmTokenType.Chapter: - CloseAll(); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Close all but table and sidebar - while ( - State.Stack.Count > 0 - && State.Peek().Type != UsfmElementType.Table - && State.Peek().Type != UsfmElementType.Sidebar - ) + // Close open elements + switch (tokenType) + { + case UsfmTokenType.Book: + case UsfmTokenType.Chapter: + CloseAll(); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") { - CloseElement(); - } + // Close all but table and sidebar + while ( + State.Stack.Count > 0 + && State.Peek().Type != UsfmElementType.Table + && State.Peek().Type != UsfmElementType.Sidebar + ) + { + CloseElement(); + } - break; - } + break; + } - // Handle special case of sidebars - if (token.Marker == "esb") - { - // Close all - CloseAll(); - break; - } + // Handle special case of sidebars + if (token.Marker == "esb") + { + // Close all + CloseAll(); 
+ break; + } - // Close all but sidebar - while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) - CloseElement(); - break; - case UsfmTokenType.Character: - // Handle special case of table cell - if (IsCell(token)) - { - // Close until row - while (State.Peek().Type != UsfmElementType.Row) + // Close all but sidebar + while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) CloseElement(); break; - } + case UsfmTokenType.Character: + // Handle special case of table cell + if (IsCell(token)) + { + // Close until row + while (State.Peek().Type != UsfmElementType.Row) + CloseElement(); + break; + } - // Handle refs - if (IsRef(token)) - { - // Refs don't close anything - break; - } + // Handle refs + if (IsRef(token)) + { + // Refs don't close anything + break; + } - // If non-nested character style, close all character styles - if (!token.Marker.StartsWith("+")) - CloseCharStyles(); - break; - case UsfmTokenType.Verse: - UsfmTag paraTag = State.ParaTag; - if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) - CloseAll(); - else + // If non-nested character style, close all character styles + if (!token.Marker.StartsWith("+")) + CloseCharStyles(); + break; + case UsfmTokenType.Verse: + UsfmTag paraTag = State.ParaTag; + if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) + CloseAll(); + else + CloseNote(); + break; + case UsfmTokenType.Note: CloseNote(); - break; - case UsfmTokenType.Note: - CloseNote(); - break; - case UsfmTokenType.End: - // If end marker for an active note - if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) - { - CloseNote(closed: true); break; - } - - // If end marker for a character style on stack, close it - // If no matching end marker, close all character styles on top of stack - UsfmParserElement elem; - bool unmatched = true; - while (State.Stack.Count > 0) - { - elem = 
State.Peek(); - if (elem.Type != UsfmElementType.Char) + case UsfmTokenType.End: + // If end marker for an active note + if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) + { + CloseNote(closed: true); break; + } - // Determine if a + prefix is needed to close it (was nested char style) - bool plusPrefix = - State.Stack.Count > 1 && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; + // If end marker for a character style on stack, close it + // If no matching end marker, close all character styles on top of stack + UsfmParserElement elem; + bool unmatched = true; + while (State.Stack.Count > 0) + { + elem = State.Peek(); + if (elem.Type != UsfmElementType.Char) + break; + + // Determine if a + prefix is needed to close it (was nested char style) + bool plusPrefix = + State.Stack.Count > 1 + && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; + + // If is a match + if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) + { + CloseElement(closed: true); + + unmatched = false; + break; + } + else + { + CloseElement(); + } + } + + // Unmatched end marker + if (unmatched) + Handler?.Unmatched(State, token.Marker); + break; + } + + VerseRef vref; + // Handle tokens + switch (tokenType) + { + case UsfmTokenType.Book: + State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); + + // Code is always upper case + string code = token.Data.ToUpperInvariant(); + + vref = State.VerseRef; + // Update verse ref. Leave book alone if not empty to prevent parsing errors + // on books with bad id lines. + if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) + vref.Book = code; + vref.ChapterNum = 1; + vref.VerseNum = 0; + State.VerseRef = vref; + State.VerseOffset = 0; - // If is a match - if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) + // Book start. 
+ Handler?.StartBook(State, token.Marker, code); + break; + case UsfmTokenType.Chapter: + // Get alternate chapter number + string altChapter = null; + string pubChapter = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "ca" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "ca*" + ) { - CloseElement(closed: true); + altChapter = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; - unmatched = false; - break; + // Skip blank space after if present + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 + ) + { + State.SpecialTokenCount++; + } } - else + + // Get publishable chapter number + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 2 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + ) { - CloseElement(); + pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 2; } - } - // Unmatched end marker - if (unmatched) - Handler?.Unmatched(State, token.Marker); - break; - } + // Chapter + vref = State.VerseRef; + vref.Chapter = token.Data; + vref.VerseNum = 0; + State.VerseRef = vref; + // Verse offset is not zeroed for chapter 1, as it is part of intro + if (State.VerseRef.ChapterNum != 1) + State.VerseOffset = 0; - VerseRef vref; - // Handle tokens - switch (tokenType) - { - case UsfmTokenType.Book: - State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); - - // Code is always upper case - string code = token.Data.ToUpperInvariant(); - - vref = State.VerseRef; - // Update verse ref. Leave book alone if not empty to prevent parsing errors - // on books with bad id lines. 
- if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) - vref.Book = code; - vref.ChapterNum = 1; - vref.VerseNum = 0; - State.VerseRef = vref; - State.VerseOffset = 0; - - // Book start. - Handler?.StartBook(State, token.Marker, code); - break; - case UsfmTokenType.Chapter: - // Get alternate chapter number - string altChapter = null; - string pubChapter = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "ca" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "ca*" - ) - { - altChapter = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - - // Skip blank space after if present + Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); + break; + case UsfmTokenType.Verse: + string pubVerse = null; + string altVerse = null; if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "va" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "va*" ) { - State.SpecialTokenCount++; + // Get alternate verse number + altVerse = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" + ) + { + // Get publishable verse number + pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 3; } - } - - // Get publishable chapter number - if ( - State.Index + State.SpecialTokenCount < 
State.Tokens.Count - 2 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - ) - { - pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 2; - } - - // Chapter - vref = State.VerseRef; - vref.Chapter = token.Data; - vref.VerseNum = 0; - State.VerseRef = vref; - // Verse offset is not zeroed for chapter 1, as it is part of intro - if (State.VerseRef.ChapterNum != 1) + + // Verse + vref = State.VerseRef; + vref.Verse = token.Data; + State.VerseRef = vref; State.VerseOffset = 0; - Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); - break; - case UsfmTokenType.Verse: - string pubVerse = null; - string altVerse = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "va" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "va*" - ) - { - // Get alternate verse number - altVerse = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" - ) - { - // Get publishable verse number - pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - - // Verse - vref = State.VerseRef; - vref.Verse = token.Data; - State.VerseRef = vref; - State.VerseOffset = 0; - - Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Start table if not open - if (State.Stack.All(e => e.Type != UsfmElementType.Table)) + 
Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") + { + // Start table if not open + if (State.Stack.All(e => e.Type != UsfmElementType.Table)) + { + State.Push(new UsfmParserElement(UsfmElementType.Table, null)); + Handler?.StartTable(State); + } + + State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); + + // Row start + Handler?.StartRow(State, token.Marker); + break; + } + + // Handle special case of sidebars + if (token.Marker == "esb") { - State.Push(new UsfmParserElement(UsfmElementType.Table, null)); - Handler?.StartTable(State); + State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); + + // Look for category + string sidebarCategory = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "cat" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "cat*" + ) + { + // Get category + sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + + Handler?.StartSidebar(State, token.Marker, sidebarCategory); + break; } - State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); + // Close sidebar if in sidebar + if (token.Marker == "esbe") + { + if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) + { + while (State.Stack.Count > 0) + CloseElement(State.Peek().Type == UsfmElementType.Sidebar); + } + else + { + Handler?.Unmatched(State, token.Marker); + } + break; + } - // Row start - Handler?.StartRow(State, token.Marker); + State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); + + // Paragraph opening + Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); break; - } + case UsfmTokenType.Character: + // Handle special case of table cells (treated as special character style) + if (IsCell(token)) + { + 
string align = "start"; + if (token.Marker.Length > 2 && token.Marker[2] == 'c') + align = "center"; + else if (token.Marker.Length > 2 && token.Marker[2] == 'r') + align = "end"; + + UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); + State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); + + Handler?.StartCell(State, baseMarker, align, colspan); + break; + } + + if (IsRef(token)) + { + // xrefs are special tokens (they do not stand alone) + State.SpecialToken = true; + + ParseDisplayAndTarget(out string display, out string target); + + State.SpecialTokenCount += 2; - // Handle special case of sidebars - if (token.Marker == "esb") - { - State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); + Handler?.Ref(State, token.Marker, display, target); + break; + } + string actualMarker; + bool invalidMarker = false; + if (token.Marker.StartsWith("+")) + { + // Only strip + if properly nested + UsfmTag charTag = State.CharTag; + actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; + invalidMarker = charTag == null; + } + else + { + actualMarker = token.Marker; + } + + State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); + Handler?.StartChar( + State, + actualMarker, + token.Type == UsfmTokenType.Unknown || invalidMarker, + token.Attributes + ); + break; + case UsfmTokenType.Note: // Look for category - string sidebarCategory = null; + string noteCategory = null; if ( State.Index < State.Tokens.Count - 3 && State.Tokens[State.Index + 1].Marker == "cat" @@ -428,148 +523,66 @@ public bool ProcessToken() ) { // Get category - sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); + noteCategory = State.Tokens[State.Index + 2].Text.Trim(); State.SpecialTokenCount += 3; } - Handler?.StartSidebar(State, token.Marker, sidebarCategory); + State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); + + Handler?.StartNote(State, token.Marker, token.Data, noteCategory); break; - } + case UsfmTokenType.Text: + string text = token.Text; - // Close sidebar if in sidebar - if (token.Marker == "esbe") - { - if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) + // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), + // or at very end, strip final space + // This is because USFM requires these to be on a new line, therefore adding whitespace + if ( + ( + State.Index == State.Tokens.Count - 1 + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter + ) + && text.Length > 0 + && text[text.Length - 1] == ' ' + ) { - while (State.Stack.Count > 0) - CloseElement(State.Peek().Type == UsfmElementType.Sidebar); + text = text.Substring(0, text.Length - 1); } - else + + if (Handler != null) { - Handler?.Unmatched(State, token.Marker); + // Replace ~ with nbsp + text = text.Replace('~', 
'\u00A0'); + + // Replace // with + foreach (string str in OptBreakSplitter.Split(text)) + { + if (str == "//") + Handler.OptBreak(State); + else + Handler.Text(State, str); + } } break; - } - - State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); - - // Paragraph opening - Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); - break; - case UsfmTokenType.Character: - // Handle special case of table cells (treated as special character style) - if (IsCell(token)) - { - string align = "start"; - if (token.Marker.Length > 2 && token.Marker[2] == 'c') - align = "center"; - else if (token.Marker.Length > 2 && token.Marker[2] == 'r') - align = "end"; - - UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); - State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); - - Handler?.StartCell(State, baseMarker, align, colspan); - break; - } - - if (IsRef(token)) - { - // xrefs are special tokens (they do not stand alone) - State.SpecialToken = true; - - ParseDisplayAndTarget(out string display, out string target); - - State.SpecialTokenCount += 2; - Handler?.Ref(State, token.Marker, display, target); + case UsfmTokenType.Milestone: + case UsfmTokenType.MilestoneEnd: + // currently, parse state doesn't need to be update, so just inform the handler about the milestone. + Handler?.Milestone( + State, + token.Marker, + token.Type == UsfmTokenType.Milestone, + token.Attributes + ); break; - } - - string actualMarker; - bool invalidMarker = false; - if (token.Marker.StartsWith("+")) - { - // Only strip + if properly nested - UsfmTag charTag = State.CharTag; - actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; - invalidMarker = charTag == null; - } - else - { - actualMarker = token.Marker; - } - - State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); - Handler?.StartChar( - State, - actualMarker, - token.Type == UsfmTokenType.Unknown || invalidMarker, - token.Attributes - ); - break; - case UsfmTokenType.Note: - // Look for category - string noteCategory = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "cat" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "cat*" - ) - { - // Get category - noteCategory = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - - State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); - - Handler?.StartNote(State, token.Marker, token.Data, noteCategory); - break; - case UsfmTokenType.Text: - string text = token.Text; - - // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), - // or at very end, strip final space - // This is because USFM requires these to be on a new line, therefore adding whitespace - if ( - ( - State.Index == State.Tokens.Count - 1 - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter - ) - && text.Length > 0 - && text[text.Length - 1] == ' ' - ) - { - text = text.Substring(0, text.Length - 1); - } - - if (Handler != null) - { - // Replace ~ with nbsp - text = text.Replace('~', '\u00A0'); - - // Replace // with - foreach (string str in OptBreakSplitter.Split(text)) - { - if (str == "//") - Handler.OptBreak(State); - else - Handler.Text(State, str); - } - } - break; - - case UsfmTokenType.Milestone: - case UsfmTokenType.MilestoneEnd: - // currently, parse state doesn't need to be update, so just inform the handler about the milestone. 
- Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes); - break; + } + } + catch (Exception e) + { + throw new UsfmParsingException(State, e); } - return true; } diff --git a/src/SIL.Machine/Corpora/UsfmParsingException.cs b/src/SIL.Machine/Corpora/UsfmParsingException.cs new file mode 100644 index 000000000..b3bcbbd59 --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmParsingException.cs @@ -0,0 +1,14 @@ +using System; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public class UsfmParsingException : Exception + { + public UsfmParsingException(UsfmParserState state, Exception exception) + : base( + $"Failed to parse at line {state.LineNumber} column {state.ColumnNumber} verse ref {state.VerseRef} with surrounding tokens [{string.Join(",", state.Tokens.ToList().GetRange(Math.Max(state.Index - 3, 0), Math.Min(7, state.Tokens.Count - (state.Index - 3))).Select(t => $"{t.Text} (TokenType={t.Type})"))}]", + exception + ) { } + } +} diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs index 9265a3178..4ac11cd78 100644 --- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs @@ -361,7 +361,7 @@ private void SkipTokens(UsfmParserState state) private bool ReplaceWithNewTokens(UsfmParserState state) { bool newText = _replace.Count > 0 && _replace.Peek(); - int tokenEnd = state.Index + state.SpecialTokenCount + 1; + int tokenEnd = state.Index + state.SpecialTokenCount; bool existingText = false; for (int index = _tokenIndex; index <= tokenEnd; index++) { @@ -393,6 +393,7 @@ private void PushTokensAsPrevious() private void PopNewTokens() { + // if (_replace.Any()) _replace.Pop(); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index b046be229..5a472b1f4 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ 
b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -70,6 +70,22 @@ public void GetRows_DuplicateVerseWithTable() Assert.That(rows, Has.Length.EqualTo(5)); } + [Test] + public void GetRows_TriplicateVerse() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\c 1 +\v 1 First verse +\v 1 First verse +\v 1 First verse +", + includeAllText: true + ); + + Assert.That(rows, Has.Length.EqualTo(1)); + } + [Test] public void GetRows_VersePara_BeginningNonVerseSegment() { From a9db2ef1ce99d66ec41b2f946e37621458dd7847 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 26 Jul 2024 11:31:47 -0400 Subject: [PATCH 02/15] Update test; add better error messages to tests --- .../Corpora/UsfmMemoryTextTests.cs | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 5a472b1f4..890c9fc22 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -25,8 +25,16 @@ public void GetRows_VerseDescriptiveTitle() { Assert.That(rows, Has.Length.EqualTo(1)); - Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); - Assert.That(rows[0].Text, Is.EqualTo("Descriptive title")); + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Descriptive title"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); }); } @@ -44,8 +52,16 @@ public void GetRows_LastSegment() { Assert.That(rows, Has.Length.EqualTo(1)); - Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); - Assert.That(rows[0].Text, Is.EqualTo("Last segment")); + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Last segment"), + 
string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); }); } @@ -67,7 +83,7 @@ public void GetRows_DuplicateVerseWithTable() includeAllText: true ); - Assert.That(rows, Has.Length.EqualTo(5)); + Assert.That(rows, Has.Length.EqualTo(5), string.Join(",", rows.ToList().Select(tr => tr.Text))); } [Test] @@ -77,13 +93,19 @@ public void GetRows_TriplicateVerse() @"\id MAT - Test \c 1 \v 1 First verse +\rem non verse \v 1 First verse +\rem non verse \v 1 First verse +\v 2 Second verse ", includeAllText: true ); - - Assert.That(rows, Has.Length.EqualTo(1)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("First verse"), string.Join(",", rows.ToList().Select(tr => tr.Text))); + Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.ToList().Select(tr => tr.Text))); + }); } [Test] @@ -104,7 +126,7 @@ public void GetRows_VersePara_BeginningNonVerseSegment() includeAllText: true ); - Assert.That(rows, Has.Length.EqualTo(4)); + Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.ToList().Select(tr => tr.Text))); } private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) From 4242e7c9974350112967cd9b717001bd9d3dd5d1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 26 Jul 2024 11:41:45 -0400 Subject: [PATCH 03/15] Move try-catch to ProcessTokens --- src/SIL.Machine/Corpora/UsfmParser.cs | 759 +++++++++++++------------- 1 file changed, 379 insertions(+), 380 deletions(-) diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 40b4a91b7..1047eb24d 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -130,7 +130,18 @@ bool preserveWhitespace /// public void ProcessTokens() { - while (ProcessToken()) { } + bool continueProcessing = true; + while (continueProcessing) + { + try + { + continueProcessing = ProcessToken(); + } + catch (Exception e) + { + throw new UsfmParsingException(State, e); + } + } } /// @@ 
-139,382 +150,288 @@ public void ProcessTokens() /// false if there were no more tokens process public bool ProcessToken() { - try + // If past end + if (State.Index >= State.Tokens.Count - 1) { - // If past end - if (State.Index >= State.Tokens.Count - 1) - { - CloseAll(); - Handler?.EndUsfm(State); - return false; - } - else if (State.Index < 0) - { - Handler?.StartUsfm(State); - } - - // Move to next token - State.Index++; - - State.LineNumber = State.Token.LineNumber; - State.ColumnNumber = State.Token.ColumnNumber; + CloseAll(); + Handler?.EndUsfm(State); + return false; + } + else if (State.Index < 0) + { + Handler?.StartUsfm(State); + } - // Update verse offset with previous token (since verse offset is from start of current token) - if (State.PrevToken != null) - State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); + // Move to next token + State.Index++; - // Skip over tokens that are to be skipped, ensuring that - // SpecialToken state is true. - if (State.SpecialTokenCount > 0) - { - State.SpecialTokenCount--; - State.SpecialToken = true; - return true; - } + State.LineNumber = State.Token.LineNumber; + State.ColumnNumber = State.Token.ColumnNumber; - // Reset special token and figure status - State.SpecialToken = false; + // Update verse offset with previous token (since verse offset is from start of current token) + if (State.PrevToken != null) + State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); - UsfmToken token = State.Token; + // Skip over tokens that are to be skipped, ensuring that + // SpecialToken state is true. 
+ if (State.SpecialTokenCount > 0) + { + State.SpecialTokenCount--; + State.SpecialToken = true; + return true; + } - // Switch unknown types to either character or paragraph - UsfmTokenType tokenType = token.Type; - if (tokenType == UsfmTokenType.Unknown) - tokenType = DetermineUnknownTokenType(); + // Reset special token and figure status + State.SpecialToken = false; - if (Handler != null && !string.IsNullOrEmpty(token.Marker)) - Handler.GotMarker(State, token.Marker); + UsfmToken token = State.Token; - // Close open elements - switch (tokenType) - { - case UsfmTokenType.Book: - case UsfmTokenType.Chapter: - CloseAll(); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Close all but table and sidebar - while ( - State.Stack.Count > 0 - && State.Peek().Type != UsfmElementType.Table - && State.Peek().Type != UsfmElementType.Sidebar - ) - { - CloseElement(); - } + // Switch unknown types to either character or paragraph + UsfmTokenType tokenType = token.Type; + if (tokenType == UsfmTokenType.Unknown) + tokenType = DetermineUnknownTokenType(); - break; - } + if (Handler != null && !string.IsNullOrEmpty(token.Marker)) + Handler.GotMarker(State, token.Marker); - // Handle special case of sidebars - if (token.Marker == "esb") + // Close open elements + switch (tokenType) + { + case UsfmTokenType.Book: + case UsfmTokenType.Chapter: + CloseAll(); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") + { + // Close all but table and sidebar + while ( + State.Stack.Count > 0 + && State.Peek().Type != UsfmElementType.Table + && State.Peek().Type != UsfmElementType.Sidebar + ) { - // Close all - CloseAll(); - break; + CloseElement(); } - // Close all but sidebar - while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) - CloseElement(); break; - case UsfmTokenType.Character: - // Handle special case of table cell - if (IsCell(token)) 
- { - // Close until row - while (State.Peek().Type != UsfmElementType.Row) - CloseElement(); - break; - } + } - // Handle refs - if (IsRef(token)) - { - // Refs don't close anything - break; - } + // Handle special case of sidebars + if (token.Marker == "esb") + { + // Close all + CloseAll(); + break; + } - // If non-nested character style, close all character styles - if (!token.Marker.StartsWith("+")) - CloseCharStyles(); + // Close all but sidebar + while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) + CloseElement(); + break; + case UsfmTokenType.Character: + // Handle special case of table cell + if (IsCell(token)) + { + // Close until row + while (State.Peek().Type != UsfmElementType.Row) + CloseElement(); break; - case UsfmTokenType.Verse: - UsfmTag paraTag = State.ParaTag; - if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) - CloseAll(); - else - CloseNote(); + } + + // Handle refs + if (IsRef(token)) + { + // Refs don't close anything break; - case UsfmTokenType.Note: + } + + // If non-nested character style, close all character styles + if (!token.Marker.StartsWith("+")) + CloseCharStyles(); + break; + case UsfmTokenType.Verse: + UsfmTag paraTag = State.ParaTag; + if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) + CloseAll(); + else CloseNote(); + break; + case UsfmTokenType.Note: + CloseNote(); + break; + case UsfmTokenType.End: + // If end marker for an active note + if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) + { + CloseNote(closed: true); break; - case UsfmTokenType.End: - // If end marker for an active note - if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) - { - CloseNote(closed: true); + } + + // If end marker for a character style on stack, close it + // If no matching end marker, close all character styles on top of stack + UsfmParserElement 
elem; + bool unmatched = true; + while (State.Stack.Count > 0) + { + elem = State.Peek(); + if (elem.Type != UsfmElementType.Char) break; - } - // If end marker for a character style on stack, close it - // If no matching end marker, close all character styles on top of stack - UsfmParserElement elem; - bool unmatched = true; - while (State.Stack.Count > 0) - { - elem = State.Peek(); - if (elem.Type != UsfmElementType.Char) - break; - - // Determine if a + prefix is needed to close it (was nested char style) - bool plusPrefix = - State.Stack.Count > 1 - && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; - - // If is a match - if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) - { - CloseElement(closed: true); - - unmatched = false; - break; - } - else - { - CloseElement(); - } - } + // Determine if a + prefix is needed to close it (was nested char style) + bool plusPrefix = + State.Stack.Count > 1 && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; - // Unmatched end marker - if (unmatched) - Handler?.Unmatched(State, token.Marker); - break; - } - - VerseRef vref; - // Handle tokens - switch (tokenType) - { - case UsfmTokenType.Book: - State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); - - // Code is always upper case - string code = token.Data.ToUpperInvariant(); - - vref = State.VerseRef; - // Update verse ref. Leave book alone if not empty to prevent parsing errors - // on books with bad id lines. - if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) - vref.Book = code; - vref.ChapterNum = 1; - vref.VerseNum = 0; - State.VerseRef = vref; - State.VerseOffset = 0; - - // Book start. 
- Handler?.StartBook(State, token.Marker, code); - break; - case UsfmTokenType.Chapter: - // Get alternate chapter number - string altChapter = null; - string pubChapter = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "ca" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "ca*" - ) + // If is a match + if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) { - altChapter = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; + CloseElement(closed: true); - // Skip blank space after if present - if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 - ) - { - State.SpecialTokenCount++; - } + unmatched = false; + break; } - - // Get publishable chapter number - if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 2 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - ) + else { - pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 2; + CloseElement(); } + } - // Chapter - vref = State.VerseRef; - vref.Chapter = token.Data; - vref.VerseNum = 0; - State.VerseRef = vref; - // Verse offset is not zeroed for chapter 1, as it is part of intro - if (State.VerseRef.ChapterNum != 1) - State.VerseOffset = 0; + // Unmatched end marker + if (unmatched) + Handler?.Unmatched(State, token.Marker); + break; + } - Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); - break; - case UsfmTokenType.Verse: - string pubVerse = null; - string altVerse = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "va" - && State.Tokens[State.Index + 2].Text != null - 
&& State.Tokens[State.Index + 3].Marker == "va*" - ) - { - // Get alternate verse number - altVerse = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } + VerseRef vref; + // Handle tokens + switch (tokenType) + { + case UsfmTokenType.Book: + State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); + + // Code is always upper case + string code = token.Data.ToUpperInvariant(); + + vref = State.VerseRef; + // Update verse ref. Leave book alone if not empty to prevent parsing errors + // on books with bad id lines. + if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) + vref.Book = code; + vref.ChapterNum = 1; + vref.VerseNum = 0; + State.VerseRef = vref; + State.VerseOffset = 0; + + // Book start. + Handler?.StartBook(State, token.Marker, code); + break; + case UsfmTokenType.Chapter: + // Get alternate chapter number + string altChapter = null; + string pubChapter = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "ca" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "ca*" + ) + { + altChapter = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + + // Skip blank space after if present if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" + State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 ) { - // Get publishable verse number - pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 3; + State.SpecialTokenCount++; } - - // Verse - vref = 
State.VerseRef; - vref.Verse = token.Data; - State.VerseRef = vref; + } + + // Get publishable chapter number + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 2 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + ) + { + pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 2; + } + + // Chapter + vref = State.VerseRef; + vref.Chapter = token.Data; + vref.VerseNum = 0; + State.VerseRef = vref; + // Verse offset is not zeroed for chapter 1, as it is part of intro + if (State.VerseRef.ChapterNum != 1) State.VerseOffset = 0; - Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Start table if not open - if (State.Stack.All(e => e.Type != UsfmElementType.Table)) - { - State.Push(new UsfmParserElement(UsfmElementType.Table, null)); - Handler?.StartTable(State); - } - - State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); - - // Row start - Handler?.StartRow(State, token.Marker); - break; - } - - // Handle special case of sidebars - if (token.Marker == "esb") - { - State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); - - // Look for category - string sidebarCategory = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "cat" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "cat*" - ) - { - // Get category - sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - - Handler?.StartSidebar(State, token.Marker, sidebarCategory); - break; - } - - // Close sidebar if in sidebar - if (token.Marker == "esbe") + Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); + break; + case 
UsfmTokenType.Verse: + string pubVerse = null; + string altVerse = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "va" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "va*" + ) + { + // Get alternate verse number + altVerse = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" + ) + { + // Get publishable verse number + pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + + // Verse + vref = State.VerseRef; + vref.Verse = token.Data; + State.VerseRef = vref; + State.VerseOffset = 0; + + Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") + { + // Start table if not open + if (State.Stack.All(e => e.Type != UsfmElementType.Table)) { - if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) - { - while (State.Stack.Count > 0) - CloseElement(State.Peek().Type == UsfmElementType.Sidebar); - } - else - { - Handler?.Unmatched(State, token.Marker); - } - break; + State.Push(new UsfmParserElement(UsfmElementType.Table, null)); + Handler?.StartTable(State); } - State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); + State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); - // Paragraph opening - Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); + // Row start + Handler?.StartRow(State, token.Marker); break; - case UsfmTokenType.Character: - // Handle special case of table cells (treated as 
special character style) - if (IsCell(token)) - { - string align = "start"; - if (token.Marker.Length > 2 && token.Marker[2] == 'c') - align = "center"; - else if (token.Marker.Length > 2 && token.Marker[2] == 'r') - align = "end"; - - UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); - State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); - - Handler?.StartCell(State, baseMarker, align, colspan); - break; - } - - if (IsRef(token)) - { - // xrefs are special tokens (they do not stand alone) - State.SpecialToken = true; - - ParseDisplayAndTarget(out string display, out string target); + } - State.SpecialTokenCount += 2; + // Handle special case of sidebars + if (token.Marker == "esb") + { + State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); - Handler?.Ref(State, token.Marker, display, target); - break; - } - - string actualMarker; - bool invalidMarker = false; - if (token.Marker.StartsWith("+")) - { - // Only strip + if properly nested - UsfmTag charTag = State.CharTag; - actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; - invalidMarker = charTag == null; - } - else - { - actualMarker = token.Marker; - } - - State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); - Handler?.StartChar( - State, - actualMarker, - token.Type == UsfmTokenType.Unknown || invalidMarker, - token.Attributes - ); - break; - case UsfmTokenType.Note: // Look for category - string noteCategory = null; + string sidebarCategory = null; if ( State.Index < State.Tokens.Count - 3 && State.Tokens[State.Index + 1].Marker == "cat" @@ -523,66 +440,148 @@ public bool ProcessToken() ) { // Get category - noteCategory = State.Tokens[State.Index + 2].Text.Trim(); + sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); State.SpecialTokenCount += 3; } - State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); - - Handler?.StartNote(State, token.Marker, token.Data, noteCategory); + Handler?.StartSidebar(State, token.Marker, sidebarCategory); break; - case UsfmTokenType.Text: - string text = token.Text; + } - // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), - // or at very end, strip final space - // This is because USFM requires these to be on a new line, therefore adding whitespace - if ( - ( - State.Index == State.Tokens.Count - 1 - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter - ) - && text.Length > 0 - && text[text.Length - 1] == ' ' - ) + // Close sidebar if in sidebar + if (token.Marker == "esbe") + { + if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) { - text = text.Substring(0, text.Length - 1); + while (State.Stack.Count > 0) + CloseElement(State.Peek().Type == UsfmElementType.Sidebar); } - - if (Handler != null) + else { - // Replace ~ with nbsp - text = text.Replace('~', '\u00A0'); - - // Replace // with - foreach (string 
str in OptBreakSplitter.Split(text)) - { - if (str == "//") - Handler.OptBreak(State); - else - Handler.Text(State, str); - } + Handler?.Unmatched(State, token.Marker); } break; + } + + State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); + + // Paragraph opening + Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); + break; + case UsfmTokenType.Character: + // Handle special case of table cells (treated as special character style) + if (IsCell(token)) + { + string align = "start"; + if (token.Marker.Length > 2 && token.Marker[2] == 'c') + align = "center"; + else if (token.Marker.Length > 2 && token.Marker[2] == 'r') + align = "end"; + + UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); + State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); + + Handler?.StartCell(State, baseMarker, align, colspan); + break; + } + + if (IsRef(token)) + { + // xrefs are special tokens (they do not stand alone) + State.SpecialToken = true; + + ParseDisplayAndTarget(out string display, out string target); + + State.SpecialTokenCount += 2; - case UsfmTokenType.Milestone: - case UsfmTokenType.MilestoneEnd: - // currently, parse state doesn't need to be update, so just inform the handler about the milestone. - Handler?.Milestone( - State, - token.Marker, - token.Type == UsfmTokenType.Milestone, - token.Attributes - ); + Handler?.Ref(State, token.Marker, display, target); break; - } - } - catch (Exception e) - { - throw new UsfmParsingException(State, e); + } + + string actualMarker; + bool invalidMarker = false; + if (token.Marker.StartsWith("+")) + { + // Only strip + if properly nested + UsfmTag charTag = State.CharTag; + actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; + invalidMarker = charTag == null; + } + else + { + actualMarker = token.Marker; + } + + State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); + Handler?.StartChar( + State, + actualMarker, + token.Type == UsfmTokenType.Unknown || invalidMarker, + token.Attributes + ); + break; + case UsfmTokenType.Note: + // Look for category + string noteCategory = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "cat" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "cat*" + ) + { + // Get category + noteCategory = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + + State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); + + Handler?.StartNote(State, token.Marker, token.Data, noteCategory); + break; + case UsfmTokenType.Text: + string text = token.Text; + + // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), + // or at very end, strip final space + // This is because USFM requires these to be on a new line, therefore adding whitespace + if ( + ( + State.Index == State.Tokens.Count - 1 + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter + ) + && text.Length > 0 + && text[text.Length - 1] == ' ' + ) + { + text = text.Substring(0, text.Length - 1); + } + + if (Handler != null) + { + // Replace ~ with nbsp + text = text.Replace('~', '\u00A0'); + + // Replace // with + foreach (string str in OptBreakSplitter.Split(text)) + { + if (str == "//") + Handler.OptBreak(State); + else + Handler.Text(State, str); + } + } + break; + + case UsfmTokenType.Milestone: + case UsfmTokenType.MilestoneEnd: + // currently, parse state doesn't need to be update, so just inform the handler about the milestone. 
+ Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes); + break; } + return true; } From 9bbd12a5193151665dd790cadafcc08b484b60f7 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 26 Jul 2024 11:49:16 -0400 Subject: [PATCH 04/15] Change error messages for ref asserts --- tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 890c9fc22..6c022a8da 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -28,7 +28,7 @@ public void GetRows_VerseDescriptiveTitle() Assert.That( rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), - string.Join(",", rows.ToList().Select(tr => tr.Text)) + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) ); Assert.That( rows[0].Text, @@ -55,7 +55,7 @@ public void GetRows_LastSegment() Assert.That( rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), - string.Join(",", rows.ToList().Select(tr => tr.Text)) + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) ); Assert.That( rows[0].Text, From d9d77dbde61060a965b41a601b964e088750ebd1 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 26 Jul 2024 13:44:50 -0400 Subject: [PATCH 05/15] Updates from review --- src/SIL.Machine/Corpora/UsfmParser.cs | 28 ++++++++++--------- .../Corpora/UsfmParsingException.cs | 14 ---------- .../Corpora/UsfmMemoryTextTests.cs | 17 ++++++----- 3 files changed, 25 insertions(+), 34 deletions(-) delete mode 100644 src/SIL.Machine/Corpora/UsfmParsingException.cs diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 1047eb24d..108f12bec 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -1,6 +1,7 @@ using System; using 
System.Collections.Generic; using System.Linq; +using System.Text; using System.Text.RegularExpressions; using SIL.Scripture; @@ -42,7 +43,19 @@ public static void Parse( versification, preserveWhitespace ); - parser.ProcessTokens(); + try + { + parser.ProcessTokens(); + } + catch (Exception ex) + { + var sb = new StringBuilder(); + sb.Append( + $"An error occurred while parsing the USFM text in Verse: {parser.State.VerseRef}, line: {parser.State.LineNumber}, " + ); + sb.Append($"column: {parser.State.ColumnNumber}, error: '{ex.Message}'"); + throw new InvalidOperationException(sb.ToString(), ex); + } } private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled); @@ -130,18 +143,7 @@ bool preserveWhitespace /// public void ProcessTokens() { - bool continueProcessing = true; - while (continueProcessing) - { - try - { - continueProcessing = ProcessToken(); - } - catch (Exception e) - { - throw new UsfmParsingException(State, e); - } - } + while (ProcessToken()) { } } /// diff --git a/src/SIL.Machine/Corpora/UsfmParsingException.cs b/src/SIL.Machine/Corpora/UsfmParsingException.cs deleted file mode 100644 index b3bcbbd59..000000000 --- a/src/SIL.Machine/Corpora/UsfmParsingException.cs +++ /dev/null @@ -1,14 +0,0 @@ -using System; -using System.Linq; - -namespace SIL.Machine.Corpora -{ - public class UsfmParsingException : Exception - { - public UsfmParsingException(UsfmParserState state, Exception exception) - : base( - $"Failed to parse at line {state.LineNumber} column {state.ColumnNumber} verse ref {state.VerseRef} with surrounding tokens [{string.Join(",", state.Tokens.ToList().GetRange(Math.Max(state.Index - 3, 0), Math.Min(7, state.Tokens.Count - (state.Index - 3))).Select(t => $"{t.Text} (TokenType={t.Type})"))}]", - exception - ) { } - } -} diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 6c022a8da..3cb84df17 100644 --- 
a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -92,19 +92,22 @@ public void GetRows_TriplicateVerse() TextRow[] rows = GetRows( @"\id MAT - Test \c 1 -\v 1 First verse -\rem non verse -\v 1 First verse -\rem non verse -\v 1 First verse +\v 1 First verse 1 +\rem non verse 1 +\v 1 First verse 2 +\rem non verse 2 +\v 1 First verse 3 +\rem non verse 3 \v 2 Second verse ", includeAllText: true ); Assert.Multiple(() => { - Assert.That(rows[0].Text, Is.EqualTo("First verse"), string.Join(",", rows.ToList().Select(tr => tr.Text))); - Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.ToList().Select(tr => tr.Text))); + Assert.That(rows, Has.Length.EqualTo(5), string.Join(",", rows.ToList().Select(tr => tr.Text))); + Assert.That(rows[0].Text, Is.EqualTo("First verse 1")); + Assert.That(rows[3].Text, Is.EqualTo("non verse 3")); + Assert.That(rows[4].Text, Is.EqualTo("Second verse")); }); } From b4345fc512d844afe97c51e1a5c7028d116cbf5f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 26 Jul 2024 15:36:51 -0400 Subject: [PATCH 06/15] Remove commented out code --- src/SIL.Machine/Corpora/UsfmTextUpdater.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs index 4ac11cd78..6d6cf01d8 100644 --- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs @@ -393,7 +393,6 @@ private void PushTokensAsPrevious() private void PopNewTokens() { - // if (_replace.Any()) _replace.Pop(); } } From d2db1aa0da1175fc81ff74186690b6a8a870e719 Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Mon, 29 Jul 2024 15:22:20 -0500 Subject: [PATCH 07/15] Add ParatextProjectTextUpdaterBase class --- .../Corpora/FileParatextProjectTextUpdater.cs | 25 ++++++++++ .../Corpora/ParatextProjectTextUpdaterBase.cs | 48 +++++++++++++++++++ ...tUpdater.cs => UpdateUsfmParserHandler.cs} | 4 +- ...sts.cs => 
UpdateUsfmParserHandlerTests.cs} | 4 +- .../Corpora/UsfmManualTests.cs | 4 +- 5 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs create mode 100644 src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs rename src/SIL.Machine/Corpora/{UsfmTextUpdater.cs => UpdateUsfmParserHandler.cs} (99%) rename tests/SIL.Machine.Tests/Corpora/{UsfmTextUpdaterTests.cs => UpdateUsfmParserHandlerTests.cs} (99%) diff --git a/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs b/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs new file mode 100644 index 000000000..c9c9dd958 --- /dev/null +++ b/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs @@ -0,0 +1,25 @@ +using System.IO; + +namespace SIL.Machine.Corpora +{ + public class FileParatextProjectTextUpdater : ParatextProjectTextUpdaterBase + { + private readonly string _projectDir; + + public FileParatextProjectTextUpdater(string projectDir) + : base(new FileParatextProjectSettingsParser(projectDir)) + { + _projectDir = projectDir; + } + + protected override bool Exists(string fileName) + { + return File.Exists(Path.Combine(_projectDir, fileName)); + } + + protected override Stream Open(string fileName) + { + return File.OpenRead(Path.Combine(_projectDir, fileName)); + } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs new file mode 100644 index 000000000..7cacc7df1 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -0,0 +1,48 @@ +using System.Collections.Generic; +using System.IO; + +namespace SIL.Machine.Corpora +{ + public abstract class ParatextProjectTextUpdaterBase + { + private readonly ParatextProjectSettingsParserBase _settingsParser; + + protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) + { + _settingsParser = settingsParser; + } + + public string UpdateUsfm( + string 
bookId, + IReadOnlyList<(IReadOnlyList, string)> rows, + string fullName = null, + bool stripAllText = false, + bool preferExistingText = true + ) + { + ParatextProjectSettings settings = _settingsParser.Parse(); + + string fileName = settings.GetBookFileName(bookId); + if (!Exists(fileName)) + return null; + + string usfm; + using (var reader = new StreamReader(Open(fileName))) + { + usfm = reader.ReadToEnd(); + } + + var handler = new UpdateUsfmParserHandler( + rows, + fullName is null ? null : $"- {fullName}", + stripAllText, + preferExistingText: preferExistingText + ); + UsfmParser.Parse(usfm, handler, settings.Stylesheet, settings.Versification); + return handler.GetUsfm(settings.Stylesheet); + } + + protected abstract bool Exists(string fileName); + protected abstract Stream Open(string fileName); + } +} diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs similarity index 99% rename from src/SIL.Machine/Corpora/UsfmTextUpdater.cs rename to src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 6d6cf01d8..ce16c03d3 100644 --- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Corpora * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. 
*/ - public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase + public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; private readonly List _tokens; @@ -20,7 +20,7 @@ public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase private int _rowIndex; private int _tokenIndex; - public UsfmTextUpdater( + public UpdateUsfmParserHandler( IReadOnlyList<(IReadOnlyList, string)> rows = null, string idText = null, bool stripAllText = false, diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs similarity index 99% rename from tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs rename to tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 4264fbe3d..ad6ce166c 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -3,7 +3,7 @@ namespace SIL.Machine.Corpora; [TestFixture] -public class UsfmTextUpdaterTests +public class UpdateUsfmParserHandlerTests { [Test] public void GetUsfm_Verse_CharStyle() @@ -446,7 +446,7 @@ private static string UpdateUsfm( source = ReadUsfm(); else source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n"; - var updater = new UsfmTextUpdater( + var updater = new UpdateUsfmParserHandler( rows, idText, stripAllText: stripAllText, diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index f0b636cfb..ceaa93c75 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -43,7 +43,7 @@ string sfmFileName in Directory.EnumerateFiles( ) ) { - var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: false); + var updater = new UpdateUsfmParserHandler(pretranslations, stripAllText: true, preferExistingText: false); 
string usfm = await File.ReadAllTextAsync(sfmFileName); UsfmParser.Parse(usfm, updater, targetSettings.Stylesheet, targetSettings.Versification); string newUsfm = updater.GetUsfm(targetSettings.Stylesheet); @@ -95,7 +95,7 @@ string sfmFileName in Directory.EnumerateFiles( ) ) { - var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: true); + var updater = new UpdateUsfmParserHandler(pretranslations, stripAllText: true, preferExistingText: true); string usfm = await File.ReadAllTextAsync(sfmFileName); UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification); string newUsfm = updater.GetUsfm(settings.Stylesheet); From b140176a1907227d5c4e5f4a83a1dd2aa637b452 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 10:48:57 -0400 Subject: [PATCH 08/15] Draft: Fill in zip classes --- .../Corpora/ZipParatextProjecTextUpdater.cs | 29 +++++++++++++++ .../ZipParatextProjectTextUpdaterBase.cs | 8 +++++ .../Corpora/UpdateUsfmParserHandlerTests.cs | 36 +++++++++++-------- 3 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs create mode 100644 src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs diff --git a/src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs b/src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs new file mode 100644 index 000000000..c8bc8f5dd --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs @@ -0,0 +1,29 @@ +using System.IO; +using System.IO.Compression; + +namespace SIL.Machine.Corpora +{ + public class ZipParatextProjectTextUpdater : ZipParatextProjectTextUpdaterBase + { + private readonly ZipArchive _archive; + + public ZipParatextProjectTextUpdater(ZipArchive archive) + : base(new ZipParatextProjectSettingsParser(archive)) + { + _archive = archive; + } + + protected override bool Exists(string fileName) + { + return _archive.GetEntry(fileName) != null; + } + + protected override 
Stream Open(string fileName) + { + ZipArchiveEntry entry = _archive.GetEntry(fileName); + if (entry == null) + return null; + return entry.Open(); + } + } +} diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs new file mode 100644 index 000000000..02993a576 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.Corpora +{ + public abstract class ZipParatextProjectTextUpdaterBase : ParatextProjectTextUpdaterBase + { + protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) + : base(settingsParser) { } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index ad6ce166c..51cde1971 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -21,7 +21,7 @@ public void GetUsfm_Verse_CharStyle() [Test] public void GetUsfm_IdText() { - string target = UpdateUsfm(idText: "- Updated"); + string target = UpdateUsfm(idText: "Updated"); Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n")); } @@ -443,21 +443,27 @@ private static string UpdateUsfm( ) { if (source is null) - source = ReadUsfm(); + { + var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTestProjectPath); + return updater.UpdateUsfm( + "MAT", + rows, + fullName: idText, + stripAllText: stripAllText, + preferExistingText: preferExistingText + ); + } else + { source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n"; - var updater = new UpdateUsfmParserHandler( - rows, - idText, - stripAllText: stripAllText, - preferExistingText: preferExistingText - ); - UsfmParser.Parse(source, updater); - return updater.GetUsfm(); - } - - private static string ReadUsfm() - { - return 
File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); + var updater = new UpdateUsfmParserHandler( + rows, + idText, + stripAllText: stripAllText, + preferExistingText: preferExistingText + ); + UsfmParser.Parse(source, updater); + return updater.GetUsfm(); + } } } From f52fc935f17dee50806055c89ce0718fcb5aee45 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 11:02:59 -0400 Subject: [PATCH 09/15] Use new class in manual tests --- tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 31587503c..833b71a14 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -135,7 +135,11 @@ await Task.WhenAll( } foreach (string usfm in sfmTexts) { - var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: true); + var updater = new UpdateUsfmParserHandler( + pretranslations, + stripAllText: true, + preferExistingText: true + ); UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification); string newUsfm = updater.GetUsfm(settings.Stylesheet); Assert.That(newUsfm, Is.Not.Null); From 6fb47eef9ded2c8ddaa71fc076f2abd07355268d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 11:38:55 -0400 Subject: [PATCH 10/15] Fix typo in file name --- ...atextProjecTextUpdater.cs => ZipParatextProjectTextUpdater.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/SIL.Machine/Corpora/{ZipParatextProjecTextUpdater.cs => ZipParatextProjectTextUpdater.cs} (100%) diff --git a/src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs similarity index 100% rename from src/SIL.Machine/Corpora/ZipParatextProjecTextUpdater.cs rename to 
src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs From 6920a0ddaf55a8fa622b501281a25554a2e06157 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 12:37:20 -0400 Subject: [PATCH 11/15] Add direct-from-settings constructor --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 17 ++++++++++------- .../ZipParatextProjectTextUpdaterBase.cs | 5 ++++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 7cacc7df1..1e08242e0 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -5,11 +5,16 @@ namespace SIL.Machine.Corpora { public abstract class ParatextProjectTextUpdaterBase { - private readonly ParatextProjectSettingsParserBase _settingsParser; + private readonly ParatextProjectSettings _settings; protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) { - _settingsParser = settingsParser; + _settings = settingsParser.Parse(); + } + + protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + { + _settings = settings; } public string UpdateUsfm( @@ -20,9 +25,7 @@ public string UpdateUsfm( bool preferExistingText = true ) { - ParatextProjectSettings settings = _settingsParser.Parse(); - - string fileName = settings.GetBookFileName(bookId); + string fileName = _settings.GetBookFileName(bookId); if (!Exists(fileName)) return null; @@ -38,8 +41,8 @@ public string UpdateUsfm( stripAllText, preferExistingText: preferExistingText ); - UsfmParser.Parse(usfm, handler, settings.Stylesheet, settings.Versification); - return handler.GetUsfm(settings.Stylesheet); + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + return handler.GetUsfm(_settings.Stylesheet); } protected abstract bool Exists(string fileName); diff --git 
a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs index 02993a576..3cf16a6e1 100644 --- a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs @@ -2,7 +2,10 @@ namespace SIL.Machine.Corpora { public abstract class ZipParatextProjectTextUpdaterBase : ParatextProjectTextUpdaterBase { - protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) + protected ZipParatextProjectTextUpdaterBase(ZipParatextProjectSettingsParserBase settingsParser) : base(settingsParser) { } + + protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + : base(settings) { } } } From 1af0e3c4aab5567a7c42f9a6033ef6a39c12fbbd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 12:40:05 -0400 Subject: [PATCH 12/15] Remove constructor --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 17 +++++++---------- .../ZipParatextProjectTextUpdaterBase.cs | 3 --- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 1e08242e0..7cacc7df1 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -5,16 +5,11 @@ namespace SIL.Machine.Corpora { public abstract class ParatextProjectTextUpdaterBase { - private readonly ParatextProjectSettings _settings; + private readonly ParatextProjectSettingsParserBase _settingsParser; protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) { - _settings = settingsParser.Parse(); - } - - protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) - { - _settings = settings; + _settingsParser = settingsParser; } public string UpdateUsfm( @@ -25,7 +20,9 @@ public string UpdateUsfm( bool preferExistingText = 
true ) { - string fileName = _settings.GetBookFileName(bookId); + ParatextProjectSettings settings = _settingsParser.Parse(); + + string fileName = settings.GetBookFileName(bookId); if (!Exists(fileName)) return null; @@ -41,8 +38,8 @@ public string UpdateUsfm( stripAllText, preferExistingText: preferExistingText ); - UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); - return handler.GetUsfm(_settings.Stylesheet); + UsfmParser.Parse(usfm, handler, settings.Stylesheet, settings.Versification); + return handler.GetUsfm(settings.Stylesheet); } protected abstract bool Exists(string fileName); diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs index 3cf16a6e1..9004d4213 100644 --- a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs @@ -4,8 +4,5 @@ public abstract class ZipParatextProjectTextUpdaterBase : ParatextProjectTextUpd { protected ZipParatextProjectTextUpdaterBase(ZipParatextProjectSettingsParserBase settingsParser) : base(settingsParser) { } - - protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettings settings) - : base(settings) { } } } From 76812681c74cb137ed1fedee31cfa30284d21a45 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 13:05:58 -0400 Subject: [PATCH 13/15] Add constructor --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 17 ++++++++++------- .../ZipParatextProjectTextUpdaterBase.cs | 3 +++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 7cacc7df1..1e08242e0 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -5,11 +5,16 @@ namespace SIL.Machine.Corpora { public abstract class ParatextProjectTextUpdaterBase { - 
private readonly ParatextProjectSettingsParserBase _settingsParser; + private readonly ParatextProjectSettings _settings; protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) { - _settingsParser = settingsParser; + _settings = settingsParser.Parse(); + } + + protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + { + _settings = settings; } public string UpdateUsfm( @@ -20,9 +25,7 @@ public string UpdateUsfm( bool preferExistingText = true ) { - ParatextProjectSettings settings = _settingsParser.Parse(); - - string fileName = settings.GetBookFileName(bookId); + string fileName = _settings.GetBookFileName(bookId); if (!Exists(fileName)) return null; @@ -38,8 +41,8 @@ public string UpdateUsfm( stripAllText, preferExistingText: preferExistingText ); - UsfmParser.Parse(usfm, handler, settings.Stylesheet, settings.Versification); - return handler.GetUsfm(settings.Stylesheet); + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + return handler.GetUsfm(_settings.Stylesheet); } protected abstract bool Exists(string fileName); diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs index 9004d4213..3cf16a6e1 100644 --- a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs @@ -4,5 +4,8 @@ public abstract class ZipParatextProjectTextUpdaterBase : ParatextProjectTextUpd { protected ZipParatextProjectTextUpdaterBase(ZipParatextProjectSettingsParserBase settingsParser) : base(settingsParser) { } + + protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + : base(settings) { } } } From 54f09805332231b40d74cfb4079d07e67edb0958 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 1 Aug 2024 22:26:55 -0400 Subject: [PATCH 14/15] Remove zip base class - unnecessary --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 8 
++++---- .../Corpora/ZipParatextProjectTextUpdater.cs | 2 +- .../Corpora/ZipParatextProjectTextUpdaterBase.cs | 11 ----------- 3 files changed, 5 insertions(+), 16 deletions(-) delete mode 100644 src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 1e08242e0..742240400 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -7,14 +7,14 @@ public abstract class ParatextProjectTextUpdaterBase { private readonly ParatextProjectSettings _settings; - protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) + protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) { - _settings = settingsParser.Parse(); + _settings = settings; } - protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) { - _settings = settings; + _settings = settingsParser.Parse(); } public string UpdateUsfm( diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs index c8bc8f5dd..0eb30f567 100644 --- a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs @@ -3,7 +3,7 @@ namespace SIL.Machine.Corpora { - public class ZipParatextProjectTextUpdater : ZipParatextProjectTextUpdaterBase + public class ZipParatextProjectTextUpdater : ParatextProjectTextUpdaterBase { private readonly ZipArchive _archive; diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs deleted file mode 100644 index 3cf16a6e1..000000000 --- a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdaterBase.cs +++ /dev/null @@ -1,11 +0,0 @@ 
-namespace SIL.Machine.Corpora -{ - public abstract class ZipParatextProjectTextUpdaterBase : ParatextProjectTextUpdaterBase - { - protected ZipParatextProjectTextUpdaterBase(ZipParatextProjectSettingsParserBase settingsParser) - : base(settingsParser) { } - - protected ZipParatextProjectTextUpdaterBase(ParatextProjectSettings settings) - : base(settings) { } - } -} From 9f431f95cbd4740ab58c629fbad21fe29a75b690 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 2 Aug 2024 08:44:07 -0400 Subject: [PATCH 15/15] Move error-handling to updater base; use updater in manual test --- .../Corpora/ParatextProjectTextUpdaterBase.cs | 20 +++++- src/SIL.Machine/Corpora/UsfmParser.cs | 19 +----- .../Corpora/UsfmManualTests.cs | 67 +++++++++++-------- 3 files changed, 59 insertions(+), 47 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index 742240400..07c2ca6c0 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -1,5 +1,7 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.IO; +using System.Text; namespace SIL.Machine.Corpora { @@ -41,8 +43,20 @@ public string UpdateUsfm( stripAllText, preferExistingText: preferExistingText ); - UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); - return handler.GetUsfm(_settings.Stylesheet); + try + { + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + return handler.GetUsfm(_settings.Stylesheet); + } + catch (Exception ex) + { + var sb = new StringBuilder(); + sb.Append($"An error occurred while parsing the usfm for '{bookId}'"); + if (!string.IsNullOrEmpty(_settings.Name)) + sb.Append($" in project '{_settings.Name}'"); + sb.Append($". 
Error: '{ex.Message}'"); + throw new InvalidOperationException(sb.ToString(), ex); + } } protected abstract bool Exists(string fileName); diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 108f12bec..8028b2fa3 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -1,7 +1,5 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; -using System.Text; using System.Text.RegularExpressions; using SIL.Scripture; @@ -43,19 +41,8 @@ public static void Parse( versification, preserveWhitespace ); - try - { - parser.ProcessTokens(); - } - catch (Exception ex) - { - var sb = new StringBuilder(); - sb.Append( - $"An error occurred while parsing the USFM text in Verse: {parser.State.VerseRef}, line: {parser.State.LineNumber}, " - ); - sb.Append($"column: {parser.State.ColumnNumber}, error: '{ex.Message}'"); - throw new InvalidOperationException(sb.ToString(), ex); - } + + parser.ProcessTokens(); } private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled); diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 833b71a14..88773c858 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -9,7 +9,7 @@ public class UsfmManualTests { [Test] [Ignore("This is for manual testing only. 
Remove this tag to run the test.")] - public async Task ParseParallelCorpusAsync() + public void ParseParallelCorpusAsync() { ParatextTextCorpus tCorpus = new(projectDir: CorporaTestHelpers.UsfmTargetProjectPath, includeAllText: true, includeMarkers: true); @@ -36,18 +36,20 @@ public async Task ParseParallelCorpusAsync() ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser( CorporaTestHelpers.UsfmTargetProjectPath ).Parse(); - + var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTargetProjectPath); foreach ( - string sfmFileName in Directory.EnumerateFiles( - CorporaTestHelpers.UsfmTargetProjectPath, - $"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}" - ) + string sfmFileName in Directory + .EnumerateFiles( + CorporaTestHelpers.UsfmTargetProjectPath, + $"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}" + ) + .Select(path => new DirectoryInfo(path).Name) ) { - var updater = new UpdateUsfmParserHandler(pretranslations, stripAllText: true, preferExistingText: false); - string usfm = await File.ReadAllTextAsync(sfmFileName); - UsfmParser.Parse(usfm, updater, targetSettings.Stylesheet, targetSettings.Versification); - string newUsfm = updater.GetUsfm(targetSettings.Stylesheet); + string bookId; + if (!targetSettings.IsBookFileName(sfmFileName, out bookId)) + continue; + string newUsfm = updater.UpdateUsfm(bookId, pretranslations, stripAllText: true, preferExistingText: false); Assert.That(newUsfm, Is.Not.Null); } } @@ -105,43 +107,52 @@ async Task GetUsfmAsync(string projectPath) ) ) .ToArrayAsync(); - List sfmTexts = []; + List bookIds = []; + ParatextProjectTextUpdaterBase updater; if (projectArchive == null) { - sfmTexts = ( - await Task.WhenAll( - Directory - .EnumerateFiles(projectPath, $"{settings.FileNamePrefix}*{settings.FileNameSuffix}") - .Select(async sfmFileName => await File.ReadAllTextAsync(sfmFileName)) - ) + bookIds = ( + Directory + .EnumerateFiles(projectPath, 
$"{settings.FileNamePrefix}*{settings.FileNameSuffix}") + .Select(path => new DirectoryInfo(path).Name) + .Select(filename => + { + string bookId; + if (settings.IsBookFileName(filename, out bookId)) + return bookId; + else + return ""; + }) + .Where(id => id != "") ).ToList(); + updater = new FileParatextProjectTextUpdater(projectPath); } else { - sfmTexts = projectArchive + bookIds = projectArchive .Entries.Where(e => e.Name.StartsWith(settings.FileNamePrefix) && e.Name.EndsWith(settings.FileNameSuffix) ) .Select(e => { - string contents; - using (var sr = new StreamReader(e.Open())) - { - contents = sr.ReadToEnd(); - } - return contents; + string bookId; + if (settings.IsBookFileName(e.Name, out bookId)) + return bookId; + else + return ""; }) + .Where(id => id != "") .ToList(); + updater = new ZipParatextProjectTextUpdater(projectArchive); } - foreach (string usfm in sfmTexts) + foreach (string bookId in bookIds) { - var updater = new UpdateUsfmParserHandler( + string newUsfm = updater.UpdateUsfm( + bookId, pretranslations, stripAllText: true, preferExistingText: true ); - UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification); - string newUsfm = updater.GetUsfm(settings.Stylesheet); Assert.That(newUsfm, Is.Not.Null); } }