diff --git a/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs b/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs index 1af7062f..59488b4e 100644 --- a/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs +++ b/src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs @@ -182,19 +182,19 @@ private static Dictionary> GetLinks( IReadOnlyList tokens ) { - XElement prevParaElem = null; + XElement prevParentElement = null; var sb = new StringBuilder(); var linkStrs = new List<(Range, string)>(); foreach (UsxToken token in tokens) { - if (token.ParaElement != prevParaElem && sb.Length > 0) + if (token.ParentElement != prevParentElement && sb.Length > 0) sb.Append(" "); int start = sb.Length; sb.Append(token); if (token.Element is XElement e && e.Name == "wg") linkStrs.Add((Range.Create(start, sb.Length), (string)e.Attribute("target_links"))); - prevParaElem = token.ParaElement; + prevParentElement = token.ParentElement; } string text = sb.ToString().Trim(); diff --git a/src/SIL.Machine/Corpora/UsxMemoryText.cs b/src/SIL.Machine/Corpora/UsxMemoryText.cs new file mode 100644 index 00000000..6eff0b6d --- /dev/null +++ b/src/SIL.Machine/Corpora/UsxMemoryText.cs @@ -0,0 +1,20 @@ +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class UsxMemoryText : UsxTextBase + { + private readonly string _usx; + + public UsxMemoryText(string id, string usx, ScrVers versification = null) + : base(id, versification) + { + _usx = usx; + } + + protected override IStreamContainer CreateStreamContainer() + { + return new MemoryStreamContainer(_usx); + } + } +} diff --git a/src/SIL.Machine/Corpora/UsxToken.cs b/src/SIL.Machine/Corpora/UsxToken.cs index 47ec1043..7743543a 100644 --- a/src/SIL.Machine/Corpora/UsxToken.cs +++ b/src/SIL.Machine/Corpora/UsxToken.cs @@ -4,14 +4,14 @@ namespace SIL.Machine.Corpora { public class UsxToken { - public UsxToken(XElement paraElem, string text, XElement elem = null) + public UsxToken(XElement parentElement, string text, XElement elem = null) { - ParaElement = paraElem; + ParentElement = parentElement; Text = text; Element = elem; } - public XElement ParaElement { get; } + public XElement ParentElement { get; } public string Text { get; } public XElement Element { get; } diff --git a/src/SIL.Machine/Corpora/UsxVerse.cs b/src/SIL.Machine/Corpora/UsxVerse.cs index 25bed852..a7b258b8 100644 --- a/src/SIL.Machine/Corpora/UsxVerse.cs +++ b/src/SIL.Machine/Corpora/UsxVerse.cs @@ -29,8 +29,15 @@ public UsxVerse(string chapter, string verse, bool isSentenceStart, IEnumerable< if (token.Text.Length == 0 || token.Text.StartsWith("\n")) continue; - if (prevToken != null && token.ParaElement != prevToken.ParaElement && sb.Length > 0 && !endsWithSpace) + if ( + prevToken != null + && token.ParentElement != prevToken.ParentElement + && sb.Length > 0 + && !endsWithSpace + ) + { sb.Append(" "); + } sb.Append(token); endsWithSpace = token.Text.EndsWith(" "); diff --git a/src/SIL.Machine/Corpora/UsxVerseParser.cs b/src/SIL.Machine/Corpora/UsxVerseParser.cs index e55ec83f..58a05dd5 100644 --- a/src/SIL.Machine/Corpora/UsxVerseParser.cs +++ b/src/SIL.Machine/Corpora/UsxVerseParser.cs @@ -1,6 +1,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Text.RegularExpressions; using System.Xml.Linq; using SIL.Machine.Utils; using SIL.Scripture; @@ -9,18 +10,40 @@ namespace SIL.Machine.Corpora { public class UsxVerseParser { - private static readonly HashSet NonVerseParaStyles = new HashSet + private static readonly HashSet VerseParaStyles = new HashSet { - "ms", - "mr", - "s", - "sr", - "r", + // Paragraphs + "p", + "m", + "po", + "pr", + "cls", + "pmo", + "pm", + "pmc", + "pmr", + "pi", + "pc", + "mi", + "nb", + // Poetry + "q", + "qc", + "qr", + "qm", + "qd", + "b", "d", - "sp", - "rem", - "restore", - "cl" + // Lists + "lh", + "li", + "lf", + "lim", + // Deprecated + "ph", + "phi", + "ps", + "psi", }; public IEnumerable Parse(Stream stream) @@ -59,7 +82,7 @@ private IEnumerable ParseElement(XElement elem, ParseContext ctxt) ctxt.IsSentenceStart = true; continue; } - ctxt.ParaElement = e; + ctxt.ParentElement = e; foreach (UsxVerse evt in ParseElement(e, ctxt)) yield return evt; break; @@ -122,6 +145,19 @@ private IEnumerable ParseElement(XElement elem, ParseContext ctxt) if (ctxt.IsInVerse) ctxt.AddToken("", e); break; + case "table": + foreach (UsxVerse evt in ParseElement(e, ctxt)) + yield return evt; + break; + case "row": + foreach (UsxVerse evt in ParseElement(e, ctxt)) + yield return evt; + break; + case "cell": + ctxt.ParentElement = e; + foreach (UsxVerse evt in ParseElement(e, ctxt)) + yield return evt; + break; } break; @@ -133,24 +169,12 @@ private IEnumerable ParseElement(XElement elem, ParseContext ctxt) } } - private static bool IsVersePara(XElement paraElem) - { - var style = (string)paraElem.Attribute("style"); - if (NonVerseParaStyles.Contains(style)) - return false; - - if (IsNumberedStyle("ms", style)) - return false; - - if (IsNumberedStyle("s", style)) - return false; - - return true; - } - - private static bool IsNumberedStyle(string stylePrefix, string style) + private static bool IsVersePara(XElement parentElement) { - return style.StartsWith(stylePrefix) && int.TryParse(style.Substring(stylePrefix.Length), out _); + string style = (string)parentElement.Attribute("style"); + // strip any digits to the right of the style name using regular expression + style = Regex.Replace(style, @"\d+$", ""); + return VerseParaStyles.Contains(style); } private class ParseContext @@ -161,11 +185,11 @@ private class ParseContext public string Verse { get; set; } public bool IsInVerse => Chapter != null && Verse != null; public bool IsSentenceStart { get; set; } = true; - public XElement ParaElement { get; set; } + public XElement ParentElement { get; set; } public void AddToken(string text, XElement elem = null) { - _tokens.Add(new UsxToken(ParaElement, text, elem)); + _tokens.Add(new UsxToken(ParentElement, text, elem)); } public UsxVerse CreateVerse() diff --git a/tests/SIL.Machine.Tests/Corpora/UsxMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsxMemoryTextTests.cs new file mode 100644 index 00000000..426c444c --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/UsxMemoryTextTests.cs @@ -0,0 +1,69 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class UsxMemoryTextTests +{ + [Test] + public void TestGetRowsDescriptiveTitle() + { + IList rows = GetRows( + """ + +- Test + + +Descriptive title + +The rest of verse one. +This is verse two. + +""" + ); + + Assert.That(rows.Count, Is.EqualTo(2)); + + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); + Assert.That(rows[0].Text, Is.EqualTo("Descriptive title")); + } + + [Test] + public void TestGetRowsTable() + { + IList rows = GetRows( + """ + + - Test + + + + Chapter + 1 + verse + 1 + + + + Chapter 1 verse 2 + +
+
+""" + ); + + Assert.That(rows.Count, Is.EqualTo(2)); + + Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1"))); + Assert.That(rows[0].Text, Is.EqualTo("Chapter 1 verse 1")); + + Assert.That(rows[1].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2"))); + Assert.That(rows[1].Text, Is.EqualTo("Chapter 1 verse 2")); + } + + private static List GetRows(string usx) + { + var text = new UsxMemoryText("MAT", usx); + return text.GetRows().ToList(); + } +}