Skip to content

Commit

Permalink
Add parameter for filtering key terms by book/chapters
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 authored and johnml1135 committed Oct 11, 2024
1 parent 3616630 commit ff75f8d
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 6 deletions.
5 changes: 3 additions & 2 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ public class ParatextBackupTermsCorpus : DictionaryTextCorpus
public ParatextBackupTermsCorpus(
string fileName,
IEnumerable<string> termCategories,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>> chapters = null
)
{
using (var archive = ZipFile.OpenRead(fileName))
Expand All @@ -18,7 +19,7 @@ public ParatextBackupTermsCorpus(
IEnumerable<(string, IReadOnlyList<string>)> glosses = new ZipParatextProjectTermsParser(
archive,
settings
).Parse(termCategories, useTermGlosses);
).Parse(termCategories, useTermGlosses, chapters);
string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

Expand Down
46 changes: 45 additions & 1 deletion src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SIL.Extensions;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand Down Expand Up @@ -49,11 +50,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti

public IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> Parse(
IEnumerable<string> termCategories,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>> chapters = null
)
{
XDocument biblicalTermsDoc;
IDictionary<string, string> termIdToCategoryDictionary;
IDictionary<string, ImmutableHashSet<VerseRef>> termIdToReferences;
if (_settings.BiblicalTermsListType == "Project")
{
if (Exists(_settings.BiblicalTermsFileName))
Expand All @@ -62,6 +65,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
else
Expand All @@ -74,6 +78,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
}
Expand All @@ -87,11 +92,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
else
{
termIdToCategoryDictionary = new Dictionary<string, string>();
termIdToReferences = new Dictionary<string, ImmutableHashSet<VerseRef>>();
}

XDocument termsGlossesDoc = null;
Expand Down Expand Up @@ -124,6 +131,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
.Where(n => n.Name.LocalName == "TermRendering")
.Select(ele => (ele.Attribute("Id").Value, ele))
.Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary))
.Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences))
.Select(kvp =>
{
string id = kvp.Item1.Replace("\n", "&#xA");
Expand All @@ -144,6 +152,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
.Where(n => n.Name.LocalName == "Localization")
.Select(ele => (ele.Attribute("Id").Value, ele))
.Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary))
.Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences))
.Select(kvp =>
{
string id = kvp.Item1.Replace("\n", "&#xA");
Expand Down Expand Up @@ -175,6 +184,24 @@ IDictionary<string, string> termIdToCategoryDictionary
|| (termIdToCategoryDictionary.TryGetValue(id, out category) && termCategories.Contains(category));
}

/// <summary>
/// Decides whether the term identified by <paramref name="id"/> passes the book/chapter filter.
/// </summary>
/// <param name="id">Id of the term to look up in <paramref name="termIdToReferences"/>.</param>
/// <param name="chapters">
/// Maps a book id to the chapter numbers to keep; an empty set keeps every chapter of that book.
/// <c>null</c> disables the filter.
/// </param>
/// <param name="termIdToReferences">Maps a term id to the verse references recorded for that term.</param>
/// <returns>
/// <c>true</c> when no filter is given, when there is no reference data at all, or when at least one of
/// the term's references falls in a requested book and chapter; otherwise <c>false</c>.
/// </returns>
private static bool IsInChapters(
    string id,
    IDictionary<string, HashSet<int>> chapters,
    IDictionary<string, ImmutableHashSet<VerseRef>> termIdToReferences
)
{
    // No filter requested, or no reference data to filter with: keep every term.
    if (chapters == null || termIdToReferences.Count == 0)
    {
        return true;
    }

    // A term that is unknown, or that has no reference set recorded, cannot match any chapter.
    // The null check matters because a term without a <References> element may map to null.
    if (!termIdToReferences.TryGetValue(id, out ImmutableHashSet<VerseRef> verseRefs) || verseRefs == null)
    {
        return false;
    }

    return verseRefs.Any(vr =>
        chapters.TryGetValue(vr.Book, out HashSet<int> bookChapters)
        && (bookChapters.Count == 0 || bookChapters.Contains(vr.ChapterNum))
    );
}

public static IReadOnlyList<string> GetGlosses(string gloss)
{
//If entire term rendering is surrounded in square brackets, remove them
Expand Down Expand Up @@ -243,6 +270,23 @@ private static IDictionary<string, string> GetCategoryPerId(XDocument biblicalTe
.ToDictionary(e => e.Attribute("Id").Value, e => e.Element("Category")?.Value ?? "");
}

/// <summary>
/// Builds a lookup from each term's <c>Id</c> attribute to the verse references listed under its
/// <c>References</c> element.
/// </summary>
/// <param name="biblicalTermsDocument">The biblical terms XML document to scan for <c>Term</c> elements.</param>
/// <returns>
/// A dictionary keyed by term id. Only the first <c>Term</c> element with a given id is used. A term
/// without a <c>References</c> element maps to an empty set rather than <c>null</c>.
/// </returns>
private static IDictionary<string, ImmutableHashSet<VerseRef>> GetReferences(XDocument biblicalTermsDocument)
{
    return biblicalTermsDocument
        .Descendants()
        .Where(n => n.Name.LocalName == "Term")
        .DistinctBy(e => e.Attribute("Id").Value)
        .ToDictionary(e => e.Attribute("Id").Value, e => ParseVerseReferences(e.Element("References")));
}

/// <summary>
/// Converts the descendants of a <c>References</c> element into verse references, skipping any entry
/// whose first nine characters are not an integer.
/// </summary>
private static ImmutableHashSet<VerseRef> ParseVerseReferences(XElement referencesElement)
{
    if (referencesElement == null)
    {
        return ImmutableHashSet<VerseRef>.Empty;
    }

    ImmutableHashSet<VerseRef>.Builder verseRefs = ImmutableHashSet.CreateBuilder<VerseRef>();
    foreach (XElement reference in referencesElement.Descendants())
    {
        string text = reference.Value;
        // Check the length first: Substring(0, 9) throws on shorter text, which would defeat the
        // TryParse guard that is meant to skip malformed references.
        if (text.Length >= 9 && int.TryParse(text.Substring(0, 9), out int verseNumber))
        {
            verseRefs.Add(new VerseRef(verseNumber));
        }
    }
    return verseRefs.ToImmutable();
}

protected abstract Stream Open(string fileName);

protected abstract bool Exists(string fileName);
Expand Down
32 changes: 29 additions & 3 deletions tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_DoNotUseTermG
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_()
public void TestGetKeyTermsFromTermsLocalizations()
{
var env = new TestEnvironment(
new DefaultParatextProjectSettings(
Expand All @@ -88,6 +88,29 @@ public void TestGetKeyTermsFromTermsLocalizations_()
Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Aaron"));
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_FilterByChapters()
{
    // Arrange: restrict the key terms to those referenced in chapter 1 of HAB.
    var requestedChapters = new Dictionary<string, HashSet<int>> { ["HAB"] = new HashSet<int> { 1 } };
    var projectSettings = new DefaultParatextProjectSettings(
        biblicalTermsListType: "Major",
        biblicalTermsFileName: "BiblicalTerms.xml",
        languageCode: "fr"
    );
    var env = new TestEnvironment(projectSettings, useTermGlosses: true, chapters: requestedChapters);

    // Act
    IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> terms = env.GetGlosses();

    // Assert: Habakkuk, YHWH, Kashdi/Chaldean are the only PN terms in HAB 1
    Assert.That(terms.Count, Is.EqualTo(3));
    Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Habaquq"));
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLocalization()
{
Expand Down Expand Up @@ -150,16 +173,19 @@ public void TestGetGlosses(string glossString, IReadOnlyList<string> expectedOut
private class TestEnvironment(
ParatextProjectSettings? settings = null,
Dictionary<string, string>? files = null,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>>? chapters = null
)
{
private readonly bool _useTermGlosses = useTermGlosses;
private readonly IDictionary<string, HashSet<int>>? _chapters = chapters;

public ParatextProjectTermsParserBase Parser { get; } =
new MemoryParatextProjectTermsParser(settings ?? new DefaultParatextProjectSettings(), files ?? new());

public IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> GetGlosses()
{
return Parser.Parse(new string[] { "PN" }, _useTermGlosses);
return Parser.Parse(new string[] { "PN" }, _useTermGlosses, _chapters);
}
}

Expand Down

0 comments on commit ff75f8d

Please sign in to comment.