diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 79e2f2ea..1b3a666c 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3614,9 +3614,14 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N # Check and log missing glossary keys if ebook_flags["has_glossary_search_key_map"] and filename.name not in IGNORED_FILENAMES: - source_text = dom.xpath("/html/body")[0].inner_text() + # Remove all noterefs, as their anchor text will otherwise immediately follow a potential glossary term, defeating the below regex. + dom_copy = deepcopy(dom) + for node in dom_copy.xpath(".//a[contains(@epub:type, 'noteref')]"): + node.remove() + + source_text = dom_copy.xpath("/html/body")[0].inner_text() if dom.xpath("/html/body//section[contains(@epub:type, 'glossary')]"): - nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") + nodes = dom_copy.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") source_text = " ".join([node.inner_text() for node in nodes]) for glossary_index, glossary_value in enumerate(glossary_usage): if glossary_value[1] is False and regex.search(r"(?<!\w)\L<val>(?!\w)", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml index d50708e4..266070a0 100644 --- a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -12,4 +12,7 @@ + + + diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml index f290d151..8a99425f 100644 --- a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -12,6 +12,7 @@

A common theory was R+L=J.

A ’versal truth.

An unknown M.O.

+

Unsiker1 is an unusual term.