Skip to content

Commit

Permalink
Add USX table and fix incorrect handling of descriptive titles
Browse files Browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Nov 6, 2024
1 parent 7f2af4e commit bef7d43
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 37 deletions.
6 changes: 3 additions & 3 deletions src/SIL.Machine/Corpora/UsxFileAlignmentCollection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,19 @@ private static Dictionary<string, HashSet<int>> GetLinks(
IReadOnlyList<UsxToken> tokens
)
{
XElement prevParaElem = null;
XElement prevParentElement = null;
var sb = new StringBuilder();
var linkStrs = new List<(Range<int>, string)>();
foreach (UsxToken token in tokens)
{
if (token.ParaElement != prevParaElem && sb.Length > 0)
if (token.ParentElement != prevParentElement && sb.Length > 0)
sb.Append(" ");

int start = sb.Length;
sb.Append(token);
if (token.Element is XElement e && e.Name == "wg")
linkStrs.Add((Range<int>.Create(start, sb.Length), (string)e.Attribute("target_links")));
prevParaElem = token.ParaElement;
prevParentElement = token.ParentElement;
}
string text = sb.ToString().Trim();

Expand Down
20 changes: 20 additions & 0 deletions src/SIL.Machine/Corpora/UsxMemoryText.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
    /// <summary>
    /// A <see cref="UsxTextBase"/> whose USX content is supplied directly as a string
    /// rather than being read from an external location.
    /// </summary>
    public class UsxMemoryText : UsxTextBase
    {
        // USX content captured at construction; handed to the stream container on demand.
        private readonly string _usx;

        /// <summary>
        /// Initializes a new instance of the <see cref="UsxMemoryText"/> class.
        /// </summary>
        /// <param name="id">The text identifier, forwarded to the base class.</param>
        /// <param name="usx">The USX content to expose through the stream container.</param>
        /// <param name="versification">
        /// The versification, forwarded to the base class unchanged (<c>null</c> when omitted).
        /// </param>
        public UsxMemoryText(string id, string usx, ScrVers versification = null)
            : base(id, versification)
        {
            this._usx = usx;
        }

        /// <summary>
        /// Creates a new <see cref="MemoryStreamContainer"/> over the stored USX string.
        /// </summary>
        /// <returns>A stream container built from the USX content given to the constructor.</returns>
        protected override IStreamContainer CreateStreamContainer() => new MemoryStreamContainer(_usx);
    }
}
6 changes: 3 additions & 3 deletions src/SIL.Machine/Corpora/UsxToken.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ namespace SIL.Machine.Corpora
{
public class UsxToken
{
public UsxToken(XElement paraElem, string text, XElement elem = null)
public UsxToken(XElement parentElement, string text, XElement elem = null)
{
ParaElement = paraElem;
ParentElement = parentElement;
Text = text;
Element = elem;
}

public XElement ParaElement { get; }
public XElement ParentElement { get; }
public string Text { get; }
public XElement Element { get; }

Expand Down
9 changes: 8 additions & 1 deletion src/SIL.Machine/Corpora/UsxVerse.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,15 @@ public UsxVerse(string chapter, string verse, bool isSentenceStart, IEnumerable<
if (token.Text.Length == 0 || token.Text.StartsWith("\n"))
continue;

if (prevToken != null && token.ParaElement != prevToken.ParaElement && sb.Length > 0 && !endsWithSpace)
if (
prevToken != null
&& token.ParentElement != prevToken.ParentElement
&& sb.Length > 0
&& !endsWithSpace
)
{
sb.Append(" ");
}

sb.Append(token);
endsWithSpace = token.Text.EndsWith(" ");
Expand Down
84 changes: 54 additions & 30 deletions src/SIL.Machine/Corpora/UsxVerseParser.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SIL.Machine.Utils;
using SIL.Scripture;
Expand All @@ -9,18 +10,40 @@ namespace SIL.Machine.Corpora
{
public class UsxVerseParser
{
private static readonly HashSet<string> NonVerseParaStyles = new HashSet<string>
private static readonly HashSet<string> VerseParaStyles = new HashSet<string>
{
"ms",
"mr",
"s",
"sr",
"r",
// Paragraphs
"p",
"m",
"po",
"pr",
"cls",
"pmo",
"pm",
"pmc",
"pmr",
"pi",
"pc",
"mi",
"nb",
// Poetry
"q",
"qc",
"qr",
"qm",
"qd",
"b",
"d",
"sp",
"rem",
"restore",
"cl"
// Lists
"lh",
"li",
"lf",
"lim",
// Deprecated
"ph",
"phi",
"ps",
"psi",
};

public IEnumerable<UsxVerse> Parse(Stream stream)
Expand Down Expand Up @@ -59,7 +82,7 @@ private IEnumerable<UsxVerse> ParseElement(XElement elem, ParseContext ctxt)
ctxt.IsSentenceStart = true;
continue;
}
ctxt.ParaElement = e;
ctxt.ParentElement = e;
foreach (UsxVerse evt in ParseElement(e, ctxt))
yield return evt;
break;
Expand Down Expand Up @@ -122,6 +145,19 @@ private IEnumerable<UsxVerse> ParseElement(XElement elem, ParseContext ctxt)
if (ctxt.IsInVerse)
ctxt.AddToken("", e);
break;
case "table":
foreach (UsxVerse evt in ParseElement(e, ctxt))
yield return evt;
break;
case "row":
foreach (UsxVerse evt in ParseElement(e, ctxt))
yield return evt;
break;
case "cell":
ctxt.ParentElement = e;
foreach (UsxVerse evt in ParseElement(e, ctxt))
yield return evt;
break;
}
break;

Expand All @@ -133,24 +169,12 @@ private IEnumerable<UsxVerse> ParseElement(XElement elem, ParseContext ctxt)
}
}

private static bool IsVersePara(XElement paraElem)
{
var style = (string)paraElem.Attribute("style");
if (NonVerseParaStyles.Contains(style))
return false;

if (IsNumberedStyle("ms", style))
return false;

if (IsNumberedStyle("s", style))
return false;

return true;
}

private static bool IsNumberedStyle(string stylePrefix, string style)
private static bool IsVersePara(XElement parentElement)
{
return style.StartsWith(stylePrefix) && int.TryParse(style.Substring(stylePrefix.Length), out _);
string style = (string)parentElement.Attribute("style");
// strip any digits to the right of the style name using regular expression
style = Regex.Replace(style, @"\d+$", "");
return VerseParaStyles.Contains(style);
}

private class ParseContext
Expand All @@ -161,11 +185,11 @@ private class ParseContext
public string Verse { get; set; }
public bool IsInVerse => Chapter != null && Verse != null;
public bool IsSentenceStart { get; set; } = true;
public XElement ParaElement { get; set; }
public XElement ParentElement { get; set; }

public void AddToken(string text, XElement elem = null)
{
_tokens.Add(new UsxToken(ParaElement, text, elem));
_tokens.Add(new UsxToken(ParentElement, text, elem));
}

public UsxVerse CreateVerse()
Expand Down
69 changes: 69 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsxMemoryTextTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using NUnit.Framework;

namespace SIL.Machine.Corpora;

[TestFixture]
public class UsxMemoryTextTests
{
    /// <summary>
    /// A verse that begins inside a descriptive-title paragraph (style "d") must still
    /// produce a row for that verse whose text is the title.
    /// </summary>
    [Test]
    public void TestGetRowsDescriptiveTitle()
    {
        string usx = """
            <usx version="3.0">
            <book code="MAT" style="id">- Test</book>
            <chapter number="1" style="c" />
            <para style="d">
            <verse number="1" style="v" sid="MAT 1:1" />Descriptive title</para>
            <para style="p">
            The rest of verse one.<verse eid="MAT 1:1" />
            <verse number="2" style="v" />This is verse two.</para>
            </usx>
            """;

        List<TextRow> rows = GetRows(usx);

        Assert.That(rows, Has.Count.EqualTo(2));

        TextRow titleRow = rows[0];
        Assert.That(titleRow.Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1")));
        Assert.That(titleRow.Text, Is.EqualTo("Descriptive title"));
    }

    /// <summary>
    /// Verse text spread across table cells must be collected into one row per verse,
    /// with the cell contents separated by single spaces.
    /// </summary>
    [Test]
    public void TestGetRowsTable()
    {
        string usx = """
            <usx version="3.0">
            <book code="MAT" style="id">- Test</book>
            <chapter number="1" style="c" />
            <table>
            <row style="tr">
            <cell style="tc1" align="start"><verse number="1" style="v" />Chapter</cell>
            <cell style="tcr2" align="end">1</cell>
            <cell style="tc3" align="start">verse</cell>
            <cell style="tcr4" align="end">1</cell>
            </row>
            <row style="tr">
            <cell style="tc1" colspan="2" align="start"><verse number="2" style="v" /></cell>
            <cell style="tc3" colspan="2" align="start">Chapter 1 verse 2</cell>
            </row>
            </table>
            </usx>
            """;

        List<TextRow> rows = GetRows(usx);

        Assert.That(rows, Has.Count.EqualTo(2));

        TextRow verseOne = rows[0];
        Assert.That(verseOne.Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1")));
        Assert.That(verseOne.Text, Is.EqualTo("Chapter 1 verse 1"));

        TextRow verseTwo = rows[1];
        Assert.That(verseTwo.Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2")));
        Assert.That(verseTwo.Text, Is.EqualTo("Chapter 1 verse 2"));
    }

    // Wraps the USX string in a UsxMemoryText for book "MAT" and materializes all of its rows.
    private static List<TextRow> GetRows(string usx) => new UsxMemoryText("MAT", usx).GetRows().ToList();
}

0 comments on commit bef7d43

Please sign in to comment.