From cd90626a3dd866c75697a30f9ba1ab67937a21b3 Mon Sep 17 00:00:00 2001 From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:40:04 -0400 Subject: [PATCH] Fix USFM parsing bugs (#229) * *Add custom exception for parsing *Fix off-by-one error *Handle triplicate, quadruplicate, n-plicate verses *Add test to cover triplicate verse * Update test; add better error messages to tests * Move try-catch to ProcessTokens * Change error messages for ref asserts * Updates from review * Remove commented out code * Add ParatextProjectTextUpdaterBase class * Draft: Fill in zip classes * Use new class in manual tests * Fix typo in file name * Add direct-from-settings constructor * Remove constructor * Add constructor * Remove zip base class - unnecessary * Move error-handling to updater base; use updater in manual test --------- Co-authored-by: John Lambert Co-authored-by: Damien Daspit --- .../Corpora/FileParatextProjectTextUpdater.cs | 25 +++++++ .../Corpora/ParatextProjectTextUpdaterBase.cs | 65 +++++++++++++++++ .../ScriptureRefUsfmParserHandlerBase.cs | 2 +- ...tUpdater.cs => UpdateUsfmParserHandler.cs} | 6 +- src/SIL.Machine/Corpora/UsfmParser.cs | 1 + .../Corpora/ZipParatextProjectTextUpdater.cs | 29 ++++++++ ...sts.cs => UpdateUsfmParserHandlerTests.cs} | 38 +++++----- .../Corpora/UsfmManualTests.cs | 71 +++++++++++-------- .../Corpora/UsfmMemoryTextTests.cs | 53 ++++++++++++-- 9 files changed, 236 insertions(+), 54 deletions(-) create mode 100644 src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs create mode 100644 src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs rename src/SIL.Machine/Corpora/{UsfmTextUpdater.cs => UpdateUsfmParserHandler.cs} (99%) create mode 100644 src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs rename tests/SIL.Machine.Tests/Corpora/{UsfmTextUpdaterTests.cs => UpdateUsfmParserHandlerTests.cs} (94%) diff --git a/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs b/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs new file mode 100644 index 000000000..c9c9dd958 --- /dev/null +++ b/src/SIL.Machine/Corpora/FileParatextProjectTextUpdater.cs @@ -0,0 +1,25 @@ +using System.IO; + +namespace SIL.Machine.Corpora +{ + public class FileParatextProjectTextUpdater : ParatextProjectTextUpdaterBase + { + private readonly string _projectDir; + + public FileParatextProjectTextUpdater(string projectDir) + : base(new FileParatextProjectSettingsParser(projectDir)) + { + _projectDir = projectDir; + } + + protected override bool Exists(string fileName) + { + return File.Exists(Path.Combine(_projectDir, fileName)); + } + + protected override Stream Open(string fileName) + { + return File.OpenRead(Path.Combine(_projectDir, fileName)); + } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs new file mode 100644 index 000000000..07c2ca6c0 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace SIL.Machine.Corpora +{ + public abstract class ParatextProjectTextUpdaterBase + { + private readonly ParatextProjectSettings _settings; + + protected ParatextProjectTextUpdaterBase(ParatextProjectSettings settings) + { + _settings = settings; + } + + protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase settingsParser) + { + _settings = settingsParser.Parse(); + } + + public string UpdateUsfm( + string bookId, + IReadOnlyList<(IReadOnlyList, string)> rows, + string fullName = null, + bool stripAllText = false, + bool preferExistingText = true + ) + { + string fileName = _settings.GetBookFileName(bookId); + if (!Exists(fileName)) + return null; + + string usfm; + using (var reader = new StreamReader(Open(fileName))) + { + usfm = reader.ReadToEnd(); + } + + var handler = new UpdateUsfmParserHandler( + rows, + fullName is null ? null : $"- {fullName}", + stripAllText, + preferExistingText: preferExistingText + ); + try + { + UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); + return handler.GetUsfm(_settings.Stylesheet); + } + catch (Exception ex) + { + var sb = new StringBuilder(); + sb.Append($"An error occurred while parsing the usfm for '{bookId}`"); + if (!string.IsNullOrEmpty(_settings.Name)) + sb.Append($" in project '{_settings.Name}'"); + sb.Append($". Error: '{ex.Message}'"); + throw new InvalidOperationException(sb.ToString(), ex); + } + } + + protected abstract bool Exists(string fileName); + protected abstract Stream Open(string fileName); + } +} diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 7d9e3391d..f7e9d5b73 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -53,7 +53,7 @@ public override void Verse( string pubNumber ) { - if (state.VerseRef.Equals(_curVerseRef)) + if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse) { EndVerseText(state, CreateVerseRefs()); // ignore duplicate verses diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs similarity index 99% rename from src/SIL.Machine/Corpora/UsfmTextUpdater.cs rename to src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 9265a3178..ce16c03d3 100644 --- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -8,7 +8,7 @@ namespace SIL.Machine.Corpora * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. */ - public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase + public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase { private readonly IReadOnlyList<(IReadOnlyList, string)> _rows; private readonly List _tokens; @@ -20,7 +20,7 @@ public class UsfmTextUpdater : ScriptureRefUsfmParserHandlerBase private int _rowIndex; private int _tokenIndex; - public UsfmTextUpdater( + public UpdateUsfmParserHandler( IReadOnlyList<(IReadOnlyList, string)> rows = null, string idText = null, bool stripAllText = false, @@ -361,7 +361,7 @@ private void SkipTokens(UsfmParserState state) private bool ReplaceWithNewTokens(UsfmParserState state) { bool newText = _replace.Count > 0 && _replace.Peek(); - int tokenEnd = state.Index + state.SpecialTokenCount + 1; + int tokenEnd = state.Index + state.SpecialTokenCount; bool existingText = false; for (int index = _tokenIndex; index <= tokenEnd; index++) { diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index c17afb387..8028b2fa3 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -41,6 +41,7 @@ public static void Parse( versification, preserveWhitespace ); + parser.ProcessTokens(); } diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs new file mode 100644 index 000000000..0eb30f567 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipParatextProjectTextUpdater.cs @@ -0,0 +1,29 @@ +using System.IO; +using System.IO.Compression; + +namespace SIL.Machine.Corpora +{ + public class ZipParatextProjectTextUpdater : ParatextProjectTextUpdaterBase + { + private readonly ZipArchive _archive; + + public ZipParatextProjectTextUpdater(ZipArchive archive) + : base(new ZipParatextProjectSettingsParser(archive)) + { + _archive = archive; + } + + protected override bool Exists(string fileName) + { + return _archive.GetEntry(fileName) != null; + } + + protected override Stream Open(string fileName) + { + ZipArchiveEntry entry = _archive.GetEntry(fileName); + if (entry == null) + return null; + return entry.Open(); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs similarity index 94% rename from tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs rename to tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 4264fbe3d..51cde1971 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTextUpdaterTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -3,7 +3,7 @@ namespace SIL.Machine.Corpora; [TestFixture] -public class UsfmTextUpdaterTests +public class UpdateUsfmParserHandlerTests { [Test] public void GetUsfm_Verse_CharStyle() @@ -21,7 +21,7 @@ public void GetUsfm_Verse_CharStyle() [Test] public void GetUsfm_IdText() { - string target = UpdateUsfm(idText: "- Updated"); + string target = UpdateUsfm(idText: "Updated"); Assert.That(target, Contains.Substring("\\id MAT - Updated\r\n")); } @@ -443,21 +443,27 @@ private static string UpdateUsfm( ) { if (source is null) - source = ReadUsfm(); + { + var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTestProjectPath); + return updater.UpdateUsfm( + "MAT", + rows, + fullName: idText, + stripAllText: stripAllText, + preferExistingText: preferExistingText + ); + } else + { source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n"; - var updater = new UsfmTextUpdater( - rows, - idText, - stripAllText: stripAllText, - preferExistingText: preferExistingText - ); - UsfmParser.Parse(source, updater); - return updater.GetUsfm(); - } - - private static string ReadUsfm() - { - return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); + var updater = new UpdateUsfmParserHandler( + rows, + idText, + stripAllText: stripAllText, + preferExistingText: preferExistingText + ); + UsfmParser.Parse(source, updater); + return updater.GetUsfm(); + } } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 6d81ae0b2..88773c858 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -9,7 +9,7 @@ public class UsfmManualTests { [Test] [Ignore("This is for manual testing only. Remove this tag to run the test.")] - public async Task ParseParallelCorpusAsync() + public void ParseParallelCorpusAsync() { ParatextTextCorpus tCorpus = new(projectDir: CorporaTestHelpers.UsfmTargetProjectPath, includeAllText: true, includeMarkers: true); @@ -36,18 +36,20 @@ public async Task ParseParallelCorpusAsync() ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser( CorporaTestHelpers.UsfmTargetProjectPath ).Parse(); - + var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTargetProjectPath); foreach ( - string sfmFileName in Directory.EnumerateFiles( - CorporaTestHelpers.UsfmTargetProjectPath, - $"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}" - ) + string sfmFileName in Directory + .EnumerateFiles( + CorporaTestHelpers.UsfmTargetProjectPath, + $"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}" + ) + .Select(path => new DirectoryInfo(path).Name) ) { - var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: false); - string usfm = await File.ReadAllTextAsync(sfmFileName); - UsfmParser.Parse(usfm, updater, targetSettings.Stylesheet, targetSettings.Versification); - string newUsfm = updater.GetUsfm(targetSettings.Stylesheet); + string bookId; + if (!targetSettings.IsBookFileName(sfmFileName, out bookId)) + continue; + string newUsfm = updater.UpdateUsfm(bookId, pretranslations, stripAllText: true, preferExistingText: false); Assert.That(newUsfm, Is.Not.Null); } } @@ -105,39 +107,52 @@ async Task GetUsfmAsync(string projectPath) ) ) .ToArrayAsync(); - List sfmTexts = []; + List bookIds = []; + ParatextProjectTextUpdaterBase updater; if (projectArchive == null) { - sfmTexts = ( - await Task.WhenAll( - Directory - .EnumerateFiles(projectPath, $"{settings.FileNamePrefix}*{settings.FileNameSuffix}") - .Select(async sfmFileName => await File.ReadAllTextAsync(sfmFileName)) - ) + bookIds = ( + Directory + .EnumerateFiles(projectPath, $"{settings.FileNamePrefix}*{settings.FileNameSuffix}") + .Select(path => new DirectoryInfo(path).Name) + .Select(filename => + { + string bookId; + if (settings.IsBookFileName(filename, out bookId)) + return bookId; + else + return ""; + }) + .Where(id => id != "") ).ToList(); + updater = new FileParatextProjectTextUpdater(projectPath); } else { - sfmTexts = projectArchive + bookIds = projectArchive .Entries.Where(e => e.Name.StartsWith(settings.FileNamePrefix) && e.Name.EndsWith(settings.FileNameSuffix) ) .Select(e => { - string contents; - using (var sr = new StreamReader(e.Open())) - { - contents = sr.ReadToEnd(); - } - return contents; + string bookId; + if (settings.IsBookFileName(e.Name, out bookId)) + return bookId; + else + return ""; }) + .Where(id => id != "") .ToList(); + updater = new ZipParatextProjectTextUpdater(projectArchive); } - foreach (string usfm in sfmTexts) + foreach (string bookId in bookIds) { - var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: true); - UsfmParser.Parse(usfm, updater, settings.Stylesheet, settings.Versification); - string newUsfm = updater.GetUsfm(settings.Stylesheet); + string newUsfm = updater.UpdateUsfm( + bookId, + pretranslations, + stripAllText: true, + preferExistingText: true + ); Assert.That(newUsfm, Is.Not.Null); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index b046be229..3cb84df17 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -25,8 +25,16 @@ public void GetRows_VerseDescriptiveTitle() { Assert.That(rows, Has.Length.EqualTo(1)); - Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); - Assert.That(rows[0].Text, Is.EqualTo("Descriptive title")); + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Descriptive title"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); }); } @@ -44,8 +52,16 @@ public void GetRows_LastSegment() { Assert.That(rows, Has.Length.EqualTo(1)); - Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); - Assert.That(rows[0].Text, Is.EqualTo("Last segment")); + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Last segment"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); }); } @@ -67,7 +83,32 @@ public void GetRows_DuplicateVerseWithTable() includeAllText: true ); - Assert.That(rows, Has.Length.EqualTo(5)); + Assert.That(rows, Has.Length.EqualTo(5), string.Join(",", rows.ToList().Select(tr => tr.Text))); + } + + [Test] + public void GetRows_TriplicateVerse() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\c 1 +\v 1 First verse 1 +\rem non verse 1 +\v 1 First verse 2 +\rem non verse 2 +\v 1 First verse 3 +\rem non verse 3 +\v 2 Second verse +", + includeAllText: true + ); + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(5), string.Join(",", rows.ToList().Select(tr => tr.Text))); + Assert.That(rows[0].Text, Is.EqualTo("First verse 1")); + Assert.That(rows[3].Text, Is.EqualTo("non verse 3")); + Assert.That(rows[4].Text, Is.EqualTo("Second verse")); + }); } [Test] @@ -88,7 +129,7 @@ public void GetRows_VersePara_BeginningNonVerseSegment() includeAllText: true ); - Assert.That(rows, Has.Length.EqualTo(4)); + Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.ToList().Select(tr => tr.Text))); } private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false)