From 8665308dec87d33a2fc936021ae08fed4c109f6f Mon Sep 17 00:00:00 2001
From: Andrew Paseltiner " + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "\n"
+ temp_xhtml = temp_xhtml + f" " + regex.sub(r"\s+", " ", node.inner_text()) + "\n"
node_number = node_number + 1
replacement_count = 1
@@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
(temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes
# Remove contractions to reduce rsquo for next regex
- temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE)
+ temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml)
# Remove all runs of ldquo that are likely to spill to the next paragraph
replacement_count = 1
while replacement_count > 0:
- (temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml, flags=regex.MULTILINE)
+ (temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml)
# Match problem `‘` using regex, and if found, get the actual node text from the dom to return.
typos = []
@@ -3009,7 +3009,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Check for closing rdquo without opening ldquo.
# Remove tds in case rdquo means "ditto mark"
- typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"
# Exclude comments
xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"([^\s>])\s+([^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"([^\s>])\s+([^>]+?>)", r"\1\2", xhtml)
try:
tree = _format_xml_str(xhtml)
@@ -1088,7 +1088,7 @@ def format_css(css: str) -> str:
output = regex.sub(r"(@[\p{Letter}]+) \(", "\\1(", output)
# Remove empty rules
- output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.DOTALL|regex.MULTILINE)
+ output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.MULTILINE)
return output
@@ -1103,7 +1103,7 @@ def remove_tags(text: str) -> str:
A string with all HTML tags removed
"""
- return regex.sub(r"?[\p{Letter}]+[^>]*?>", "", text, flags=regex.DOTALL)
+ return regex.sub(r"?[\p{Letter}]+[^>]*?>", "", text)
def get_ordinal(number: str) -> str:
"""
@@ -1296,7 +1296,7 @@ def make_url_safe(text: str) -> str:
text = regex.sub(r"['‘’`]", "", text)
# 5. Convert any non-digit, non-letter character to a space
- text = regex.sub(r"[^0-9\p{Letter}]", " ", text, flags=regex.IGNORECASE)
+ text = regex.sub(r"[^0-9\p{Letter}]", " ", text)
# 6. Convert any instance of one or more space to a dash
text = regex.sub(r"\s+", "-", text)
diff --git a/se/se_epub.py b/se/se_epub.py
index b8e55ae0..a34780a3 100644
--- a/se/se_epub.py
+++ b/se/se_epub.py
@@ -699,7 +699,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
output_xhtml = output_xhtml.replace("xml|lang", "lang")
output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
- output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml, flags=regex.MULTILINE)
+ output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml)
# The Nu HTML5 Validator barfs if non-void elements are self-closed (like `<td/>`)
# Try to un-self-close them for HTML5 output.
diff --git a/se/se_epub_generate_toc.py b/se/se_epub_generate_toc.py
index 2c5eacb4..542e515f 100644
--- a/se/se_epub_generate_toc.py
+++ b/se/se_epub_generate_toc.py
@@ -109,7 +109,7 @@ def toc_link(self) -> str:
out_string += f"{self.title}\n"
# Replace `<br/>` with a single space
- out_string = regex.sub(r"
\s*", " ", out_string, flags=regex.DOTALL)
+ out_string = regex.sub(r"
\s*", " ", out_string)
return out_string
diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py
index 89b66c8e..b87c11cb 100644
--- a/se/se_epub_lint.py
+++ b/se/se_epub_lint.py
@@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)
# Normalize whitespace
- title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
+ title = regex.sub(r"\s+", " ", title).strip()
# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
subtitle_matches = regex.findall(r"(.*?)(.*?)(.*?)", title, flags=regex.DOTALL)
@@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
# Check for repeated punctuation, but first remove `&` so we don't match `&,`
# Remove tds with repeated ” as they are probably ditto marks
- matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)? ", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE)
+ matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)? ", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents)
if matches:
messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes]))
# Check for two-em-dashes used for elision instead of three-em-dashes
- matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE)
+ matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents)
if matches:
messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “
for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"):
node.set_attr("id", "lint-" + str(node_number))
- temp_xhtml = temp_xhtml + f"]*?>[”\s]+?(.+?)? ", "", file_contents), flags=regex.DOTALL)
+ typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"]*?>[”\s]+?(.+?)? ", "", file_contents))
# We create a filter to try to exclude nested quotations
# Remove tags in case they're enclosing punctuation we want to match against at the end of a sentence.
diff --git a/se/typography.py b/se/typography.py
index 11f9dc91..f9d70264 100644
--- a/se/typography.py
+++ b/se/typography.py
@@ -103,10 +103,10 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace("——", "⸺")
# Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
- xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"-“
…", xhtml, flags=regex.IGNORECASE) + xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml) + xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml) + xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml) + xhtml = regex.sub(fr"
]*?)>{se.HAIR_SPACE}…", r"
…", xhtml) # Remove spaces between opening tags and ellipses - xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml, flags=regex.IGNORECASE) + xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml) # Remove spaces between closing tags and ellipses - xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\p{{Letter}}0-9]+>)", r"…\1", xhtml, flags=regex.IGNORECASE) - xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml, flags=regex.IGNORECASE) # If followed by a letter, the single quote is probably a leading elision - xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml, flags=regex.IGNORECASE) - xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml, flags=regex.IGNORECASE) - xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE) - xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE) + xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\p{{Letter}}0-9]+>)", r"…\1", xhtml) + xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml) # If followed by a letter, the single quote is probably a leading elision + xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml) + xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml) + xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml) + xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml) # Add nbsp to ellipses that open dialog - xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE) + xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml) # Don't use . ... 
if within a clause xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml) diff --git a/se/vendor/kobo_touch_extended/kobo.py b/se/vendor/kobo_touch_extended/kobo.py index 860b71db..ca9032fc 100644 --- a/se/vendor/kobo_touch_extended/kobo.py +++ b/se/vendor/kobo_touch_extended/kobo.py @@ -32,7 +32,7 @@ def append_kobo_spans_from_text(node, text): return False else: # Split text in sentences - groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text, flags=regex.MULTILINE) + groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text) # Remove empty strings resulting from split() groups = [g for g in groups if g != ""]