Skip to content

Commit

Permalink
Refactor to avoid archive difficulties in corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Aug 22, 2024
1 parent 0f2ecff commit cfc0150
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 79 deletions.
38 changes: 19 additions & 19 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;

namespace SIL.Machine.Corpora
{
public class ParatextBackupTermsCorpus : ParatextTermsCorpusBase
public class ParatextBackupTermsCorpus : DictionaryTextCorpus
{
private readonly ZipArchive _archive;

public ParatextBackupTermsCorpus(
ZipArchive archive,
string fileName,
IEnumerable<string> termCategories,
bool useTermGlosses = true
)
{
_archive = archive;
AddTexts(new ZipParatextProjectSettingsParser(archive).Parse(), termCategories, useTermGlosses);
}

protected override bool Exists(string fileName)
{
return _archive.GetEntry(fileName) != null;
}
using (var archive = ZipFile.OpenRead(fileName))
{
ParatextProjectSettings settings = new ZipParatextProjectSettingsParser(archive).Parse();
IEnumerable<(string, IEnumerable<string>)> glosses = new ZipParatextTermsParser(archive).Parse(
settings,
termCategories,
useTermGlosses
);
string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

protected override Stream Open(string fileName)
{
ZipArchiveEntry entry = _archive.GetEntry(fileName);
if (entry == null)
return null;
return entry.Open();
IText text = new MemoryText(
textId,
glosses.Select(kvp => new TextRow(textId, kvp.Item1) { Segment = kvp.Item2.ToList() })
);
AddText(text);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

namespace SIL.Machine.Corpora
{
public abstract class ParatextTermsCorpusBase : DictionaryTextCorpus
public abstract class ParatextTermsParserBase
{
private static readonly List<string> PredefinedTermsListTypes = new List<string>()
{
Expand All @@ -34,7 +34,7 @@ public abstract class ParatextTermsCorpusBase : DictionaryTextCorpus
private static readonly Regex ContentInBracketsRegex = new Regex(@"^\[(.+?)\]$", RegexOptions.Compiled);
private static readonly Regex NumericalInformationRegex = new Regex(@"\s+\d+(\.\d+)*$", RegexOptions.Compiled);

protected void AddTexts(
public IEnumerable<(string, IEnumerable<string>)> Parse(
ParatextProjectSettings settings,
IEnumerable<string> termCategories,
bool useTermGlosses = true
Expand Down Expand Up @@ -144,7 +144,12 @@ protected void AddTexts(
.ToDictionary(kvp => kvp.Item1, kvp => kvp.Item2);
}
if (termsGlosses.Count > 0 || termsRenderings.Count > 0)
AddTerms(termsRenderings, termsGlosses, settings);
{
return termsRenderings
.Concat(termsGlosses.Where(kvp => !termsRenderings.ContainsKey(kvp.Key)))
.Select(kvp => (kvp.Key, kvp.Value));
}
return new List<(string, IEnumerable<string>)>();
}

private static bool IsInCategory(
Expand All @@ -158,26 +163,6 @@ IDictionary<string, string> termIdToCategoryDictionary
|| (termIdToCategoryDictionary.TryGetValue(id, out category) && termCategories.Contains(category));
}

/// <summary>
/// Merges term renderings and term glosses into one in-memory text and adds it to this corpus.
/// </summary>
/// <param name="termsRenderings">Entries that take precedence when a key occurs in both inputs.</param>
/// <param name="termsGlosses">Entries used only for keys absent from <paramref name="termsRenderings"/>.</param>
/// <param name="settings">Supplies the three Biblical Terms values that make up the text id.</param>
private void AddTerms(
    IDictionary<string, IEnumerable<string>> termsRenderings,
    IDictionary<string, IEnumerable<string>> termsGlosses,
    ParatextProjectSettings settings
)
{
    string textId =
        $"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

    // Prefer renderings to gloss localizations: a gloss survives only when no rendering shares its key.
    IEnumerable<KeyValuePair<string, IEnumerable<string>>> glossOnlyEntries = termsGlosses.Where(gloss =>
        !termsRenderings.ContainsKey(gloss.Key)
    );
    IDictionary<string, IEnumerable<string>> merged = termsRenderings
        .Concat(glossOnlyEntries)
        .ToDictionary(pair => pair.Key, pair => pair.Value);

    // One row per merged entry; the row reference is the entry key and the segment is its values.
    IEnumerable<TextRow> rows = merged.Select(pair => new TextRow(textId, pair.Key)
    {
        Segment = pair.Value.ToList()
    });
    IText termsText = new MemoryText(textId, rows);
    AddText(termsText);
}

public static IReadOnlyList<string> GetGlosses(string gloss)
{
//If entire term rendering is surrounded in square brackets, remove them
Expand Down
28 changes: 28 additions & 0 deletions src/SIL.Machine/Corpora/ZipParatextTermsParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
using System.IO;
using System.IO.Compression;

namespace SIL.Machine.Corpora
{
/// <summary>
/// <see cref="ParatextTermsParserBase"/> implementation whose file lookups are answered
/// from the entries of a <see cref="ZipArchive"/>.
/// </summary>
public class ZipParatextTermsParser : ParatextTermsParserBase
{
    // Archive handed in by the caller; every lookup goes through GetEntry on it.
    private readonly ZipArchive _archive;

    /// <summary>Creates a parser that reads from <paramref name="archive"/>.</summary>
    /// <param name="archive">The archive to look entries up in.</param>
    public ZipParatextTermsParser(ZipArchive archive)
    {
        _archive = archive;
    }

    /// <summary>Reports whether the archive contains an entry named <paramref name="fileName"/>.</summary>
    protected override bool Exists(string fileName) => !(_archive.GetEntry(fileName) is null);

    /// <summary>
    /// Opens the entry named <paramref name="fileName"/>.
    /// </summary>
    /// <returns>The entry's stream, or <c>null</c> when the archive has no such entry.</returns>
    protected override Stream Open(string fileName) => _archive.GetEntry(fileName)?.Open();
}
}

This file was deleted.

20 changes: 20 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/MemoryProjectTermsParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
using System.Text;

namespace SIL.Machine.Corpora;

/// <summary>
/// <see cref="ParatextTermsParserBase"/> implementation backed by a dictionary that maps
/// file names to file contents held as strings.
/// </summary>
public class MemoryParatextTermsParser(IDictionary<string, string> files) : ParatextTermsParserBase
{
    /// <summary>The file-name-to-contents map supplied at construction.</summary>
    public IDictionary<string, string> Files { get; } = files;

    /// <summary>Reports whether <see cref="Files"/> has a key equal to <paramref name="fileName"/>.</summary>
    protected override bool Exists(string fileName) => Files.ContainsKey(fileName);

    /// <summary>
    /// Returns the contents stored under <paramref name="fileName"/> as a UTF-8 encoded stream.
    /// </summary>
    /// <returns>A new <see cref="MemoryStream"/>, or <c>null</c> when the name is not in <see cref="Files"/>.</returns>
    protected override Stream? Open(string fileName) =>
        Files.TryGetValue(fileName, out string? text) ? new MemoryStream(Encoding.UTF8.GetBytes(text)) : null;
}
26 changes: 26 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsCorpus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
namespace SIL.Machine.Corpora;

/// <summary>
/// A <see cref="DictionaryTextCorpus"/> containing one in-memory text built from the entries that
/// <see cref="MemoryParatextTermsParser"/> parses out of a set of in-memory files.
/// </summary>
public class ParatextProjectTermsCorpus : DictionaryTextCorpus
{
    /// <summary>
    /// Parses the given files and adds the resulting text to this corpus.
    /// </summary>
    /// <param name="files">File-name-to-contents map handed to the parser.</param>
    /// <param name="settings">Passed to the parser and used to build the text id.</param>
    /// <param name="termCategories">Passed through to the parser unchanged.</param>
    /// <param name="useTermGlosses">Passed through to the parser unchanged.</param>
    public ParatextProjectTermsCorpus(
        IDictionary<string, string> files,
        ParatextProjectSettings settings,
        IEnumerable<string> termCategories,
        bool useTermGlosses = true
    )
    {
        var parser = new MemoryParatextTermsParser(files);
        IEnumerable<(string, IEnumerable<string>)> parsedEntries = parser.Parse(
            settings,
            termCategories,
            useTermGlosses
        );

        // The id doubles as the text's id and as the text id of every row in it.
        string textId =
            $"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

        // One row per parsed entry: first tuple element is the row reference, second becomes the segment.
        IEnumerable<TextRow> rows = parsedEntries.Select(entry => new TextRow(textId, entry.Item1)
        {
            Segment = entry.Item2.ToList()
        });
        IText termsText = new MemoryText(textId, rows);
        AddText(termsText);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
namespace SIL.Machine.Corpora;

[TestFixture]
public class ParatextProjectTermsCorpusTests
public class ParatextTermsCorpusTests
{
[Test]
public void TestGetKeyTermsFromTermsRenderings()
Expand Down Expand Up @@ -144,7 +144,7 @@ public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLoc
public void TestStripParens(string testString, string expectedOutput, char left = '(', char right = ')')
{
Assert.That(
ParatextTermsCorpusBase.StripParens(testString, left: left, right: right),
ParatextTermsParserBase.StripParens(testString, left: left, right: right),
Is.EqualTo(expectedOutput)
);
}
Expand All @@ -159,7 +159,7 @@ public void TestStripParens(string testString, string expectedOutput, char left
[TestCase("Ahasuerus, Xerxes; Assuerus", new string[] { "Ahasuerus", "Xerxes", "Assuerus" })]
public void TestGetGlosses(string glossString, IReadOnlyList<string> expectedOutput)
{
Assert.That(ParatextTermsCorpusBase.GetGlosses(glossString), Is.EqualTo(expectedOutput));
Assert.That(ParatextTermsParserBase.GetGlosses(glossString), Is.EqualTo(expectedOutput));
}

private class TestEnvironment(
Expand All @@ -168,11 +168,11 @@ private class TestEnvironment(
bool useTermGlosses = true
)
{
public MemoryParatextProjectTermsCorpus Corpus { get; } =
new MemoryParatextProjectTermsCorpus(
public ParatextProjectTermsCorpus Corpus { get; } =
new ParatextProjectTermsCorpus(
files ?? new(),
settings ?? new DefaultParatextProjectSettings(),
new string[] { "PN" },
files ?? new(),
useTermGlosses
);
}
Expand Down

0 comments on commit cfc0150

Please sign in to comment.