Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up regex flags #735

Merged
merged 1 commit into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions se/commands/create_draft.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,8 +727,8 @@ def _create_draft(args: Namespace, plain_output: bool):

producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text)
producers_text = regex.sub(r"[\r\n]+", " ", producers_text)
producers_text = regex.sub(r",? and ", ", and ", producers_text)
producers_text = producers_text.replace(" and the Online", " and The Online")
producers_text = producers_text.replace(", and ", ", ").strip()
Expand Down Expand Up @@ -945,7 +945,7 @@ def _create_draft(args: Namespace, plain_output: bool):

i = i + 1

metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)
metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url\.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml)

if ebook_wiki_url:
metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")
Expand Down
2 changes: 1 addition & 1 deletion se/commands/word_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def word_count(plain_output: bool) -> int:

else:
# We couldn't generate a dom, fall back to regex replacements
xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"<span class=\"pagenum\">.+?</span>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)

total_word_count += se.formatting.get_word_count(xhtml)
Expand Down
18 changes: 9 additions & 9 deletions se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,16 +362,16 @@ def get_word_count(xhtml: str) -> int:
xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)

# Replace some formatting characters
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml)

# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or); they count as a word boundary but they shouldn't
xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml)

# Replace sequential spaces with one space
xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(r"\s+", " ", xhtml)

# Get the word count
return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
return len(regex.findall(r"\b\w+\b", xhtml))

def _replace_character_references(match_object) -> str:
"""Replace most XML character references with literal characters.
Expand Down Expand Up @@ -660,13 +660,13 @@ def format_xhtml(xhtml: str) -> str:
xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)

# Remove unnecessary doctypes which can cause xmllint to hang
xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)
xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml)

# Remove white space between opening/closing tag and text nodes
# We do this first so that we can still format line breaks after <br/>
# Exclude comments
xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml)

try:
tree = _format_xml_str(xhtml)
Expand Down Expand Up @@ -1088,7 +1088,7 @@ def format_css(css: str) -> str:
output = regex.sub(r"(@[\p{Letter}]+) \(", "\\1(", output)

# Remove empty rules
output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.DOTALL|regex.MULTILINE)
output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.MULTILINE)

return output

Expand All @@ -1103,7 +1103,7 @@ def remove_tags(text: str) -> str:
A string with all HTML tags removed
"""

return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text, flags=regex.DOTALL)
return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text)

def get_ordinal(number: str) -> str:
"""
Expand Down Expand Up @@ -1296,7 +1296,7 @@ def make_url_safe(text: str) -> str:
text = regex.sub(r"['‘’`]", "", text)

# 5. Convert any non-digit, non-letter character to a space
text = regex.sub(r"[^0-9\p{Letter}]", " ", text, flags=regex.IGNORECASE)
text = regex.sub(r"[^0-9\p{Letter}]", " ", text)

# 6. Convert any instance of one or more space to a dash
text = regex.sub(r"\s+", "-", text)
Expand Down
2 changes: 1 addition & 1 deletion se/se_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
output_xhtml = output_xhtml.replace("xml|lang", "lang")
output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml, flags=regex.MULTILINE)
output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml)

# The Nu HTML5 Validator barfs if non-void elements are self-closed (like <td/>)
# Try to un-self-close them for HTML5 output.
Expand Down
2 changes: 1 addition & 1 deletion se/se_epub_generate_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def toc_link(self) -> str:
out_string += f"<a href=\"text/{self.file_link}\">{self.title}</a>\n"

# Replace <br/> with a single space
out_string = regex.sub(r"<br/>\s*", " ", out_string, flags=regex.DOTALL)
out_string = regex.sub(r"<br/>\s*", " ", out_string)

return out_string

Expand Down
14 changes: 7 additions & 7 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)

# Normalize whitespace
title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
title = regex.sub(r"\s+", " ", title).strip()

# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
subtitle_matches = regex.findall(r"(.*?)<span epub:type=\"subtitle\">(.*?)</span>(.*?)", title, flags=regex.DOTALL)
Expand Down Expand Up @@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,

# Check for repeated punctuation, but first remove `&amp;` so we don't match `&amp;,`
# Remove tds with repeated ” as they are probably ditto marks
matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE)
matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents)
if matches:
messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches))

Expand Down Expand Up @@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes]))

# Check for two-em-dashes used for elision instead of three-em-dashes
matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE)
matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents)
if matches:
messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches))

Expand Down Expand Up @@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “
for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"):
node.set_attr("id", "lint-" + str(node_number))
temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "\n"
temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"\s+", " ", node.inner_text()) + "\n"
node_number = node_number + 1

replacement_count = 1
Expand All @@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
(temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes

# Remove contractions to reduce rsquo for next regex
temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE)
temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml)

# Remove all runs of ldquo that are likely to spill to the next <p>
replacement_count = 1
while replacement_count > 0:
(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml, flags=regex.MULTILINE)
(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml)

# Match problem `‘` using regex, and if found, get the actual node text from the dom to return.
typos = []
Expand Down Expand Up @@ -3009,7 +3009,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c

# Check for closing rdquo without opening ldquo.
# Remove tds in case rdquo means "ditto mark"
typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents), flags=regex.DOTALL)
typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents))

# We create a filter to try to exclude nested quotations
# Remove tags in case they're enclosing punctuation we want to match against at the end of a sentence.
Expand Down
56 changes: 28 additions & 28 deletions se/typography.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,18 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace("——", "⸺")

# Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml)
xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml)
xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml)
xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml)

# Now that we've fixed Smartypants' output, put our quotes back in
xhtml = xhtml.replace("!#se:rsquo#!", "’")

# Remove spaces between en and em dashes
# Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
# We do a negative lookbehind for <br/ to prevent newlines/indents after <br/>s from being included
xhtml = regex.sub(r"(?<!<br/)([^\.…\s])\s*([–—])\s*", r"\1\2", xhtml, flags=regex.DOTALL)
xhtml = regex.sub(r"(?<!<br/)([^\.…\s])\s*([–—])\s*", r"\1\2", xhtml)

# First, remove stray word joiners
xhtml = xhtml.replace(se.WORD_JOINER, "")
Expand All @@ -123,12 +123,12 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace(se.SHY_HYPHEN, "")

# Fix some common em-dash transcription errors
xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml)
xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml)
xhtml = regex.sub(r":-</", fr":{se.WORD_JOINER}—</", xhtml)

# Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml)

# Add en dashes; don't replace match that is within an html tag, since ids and attrs often contain the pattern DIGIT-DIGIT
xhtml = regex.sub(r"(?<!<[^>]*)([0-9]+)\-([0-9]+)", r"\1–\2", xhtml)
Expand All @@ -146,7 +146,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(fr"([\p{{Lowercase_Letter}}]){se.WORD_JOINER}—th\b", r"\1 —th", xhtml)

# Remove word joiners from following opening tags--they're usually never correct
xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml)

# Add a word joiner after em dashes within <cite> elements
xhtml = regex.sub(r"<cite([^>]*?)>—", fr"<cite\1>—{se.WORD_JOINER}", xhtml)
Expand Down Expand Up @@ -213,7 +213,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(\s)‘a’(\s)", r"\1’a’\2", xhtml, flags=regex.IGNORECASE)

# Years
xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml)

xhtml = regex.sub(r"‘([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]isn’t|[Tt]was|[Tt]ain’t|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]won|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on|[Uu]ns?|[Uu]d|[Cc]ept|[Oo]w|[Aa]ppen|[Ee])\b", r"’\1", xhtml)

Expand Down Expand Up @@ -245,34 +245,34 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(?<!A\. )B\.\s+C\.", r"BC", xhtml)

# Put spacing next to close quotes
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*‘", fr"“{se.HAIR_SPACE}‘", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"’[\s{se.NO_BREAK_SPACE}]*”", fr"’{se.HAIR_SPACE}”", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*’", fr"“{se.HAIR_SPACE}’", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*“", fr"‘{se.HAIR_SPACE}“", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*’", fr"‘{se.HAIR_SPACE}’", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*‘", fr"“{se.HAIR_SPACE}‘", xhtml)
xhtml = regex.sub(fr"’[\s{se.NO_BREAK_SPACE}]*”", fr"’{se.HAIR_SPACE}”", xhtml)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*’", fr"“{se.HAIR_SPACE}’", xhtml)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*“", fr"‘{se.HAIR_SPACE}“", xhtml)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*’", fr"‘{se.HAIR_SPACE}’", xhtml)

# We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ’e said.
xhtml = regex.sub(fr"”[\s{se.NO_BREAK_SPACE}]*’([^\p{{Letter}}])", fr"”{se.HAIR_SPACE}’\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"”[\s{se.NO_BREAK_SPACE}]*’([^\p{{Letter}}])", fr"”{se.HAIR_SPACE}’\1", xhtml)

# Fix ellipses spacing
xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"<p([^>]*?)>{se.HAIR_SPACE}…", r"<p\1>…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml)
xhtml = regex.sub(fr"<p([^>]*?)>{se.HAIR_SPACE}…", r"<p\1>…", xhtml)

# Remove spaces between opening tags and ellipses
xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml)

# Remove spaces between closing tags and ellipses
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml, flags=regex.IGNORECASE) # If followed by a letter, the single quote is probably a leading elision
xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml) # If followed by a letter, the single quote is probably a leading elision
xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml)
xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml)
xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml)

# Add nbsp to ellipses that open dialog
xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml)

# Don't use . ... if within a clause
xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml)
Expand Down
2 changes: 1 addition & 1 deletion se/vendor/kobo_touch_extended/kobo.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def append_kobo_spans_from_text(node, text):
return False
else:
# Split text in sentences
groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text, flags=regex.MULTILINE)
groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text)
# Remove empty strings resulting from split()
groups = [g for g in groups if g != ""]

Expand Down
Loading