From 8665308dec87d33a2fc936021ae08fed4c109f6f Mon Sep 17 00:00:00 2001
From: Andrew Paseltiner
Date: Sun, 14 Jul 2024 10:46:36 -0400
Subject: [PATCH] Clean up regex flags

- MULTILINE is irrelevant unless ^ is used to match line starts
- IGNORECASE is irrelevant when case-agnostic character classes are already
  being used (e.g. \w or \p{Letter}) or when only punctuation/spaces are
  being matched
- DOTALL is irrelevant unless . is used to match all characters
---
 se/commands/create_draft.py           |  6 +--
 se/commands/word_count.py             |  2 +-
 se/formatting.py                      | 18 ++++-----
 se/se_epub.py                         |  2 +-
 se/se_epub_generate_toc.py            |  2 +-
 se/se_epub_lint.py                    | 14 +++----
 se/typography.py                      | 56 +++++++++++++--------------
 se/vendor/kobo_touch_extended/kobo.py |  2 +-
 8 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/se/commands/create_draft.py b/se/commands/create_draft.py
index 432054b5..78f2e516 100644
--- a/se/commands/create_draft.py
+++ b/se/commands/create_draft.py
@@ -727,8 +727,8 @@ def _create_draft(args: Namespace, plain_output: bool):
 	producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
 	producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
-	producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
-	producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
+	producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text)
+	producers_text = regex.sub(r"[\r\n]+", " ", producers_text)
 	producers_text = regex.sub(r",? and ", ", and ", producers_text)
 	producers_text = producers_text.replace(" and the Online", " and The Online")
 	producers_text = producers_text.replace(", and ", ", ").strip()
@@ -945,7 +945,7 @@ def _create_draft(args: Namespace, plain_output: bool):
 
 		i = i + 1
 
-	metadata_xml = regex.sub(r"\t\tTRANSCRIBER\s*TRANSCRIBER_SORT\s*TRANSCRIBER_URL\s*trc", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)
+	metadata_xml = regex.sub(r"\t\tTRANSCRIBER\s*TRANSCRIBER_SORT\s*TRANSCRIBER_URL\s*trc", "\t\t" + producers_xhtml.strip(), metadata_xml)
 
 	if ebook_wiki_url:
 		metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")
diff --git a/se/commands/word_count.py b/se/commands/word_count.py
index 233c92b9..da031163 100644
--- a/se/commands/word_count.py
+++ b/se/commands/word_count.py
@@ -66,7 +66,7 @@ def word_count(plain_output: bool) -> int:
 		else:
 			# We couldn't generate a dom, fall back to regex replacements
-			xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
+			xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE)
 			xhtml = regex.sub(r".+?", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
 
 		total_word_count += se.formatting.get_word_count(xhtml)
diff --git a/se/formatting.py b/se/formatting.py
index 6a1677ef..06b8888f 100644
--- a/se/formatting.py
+++ b/se/formatting.py
@@ -362,16 +362,16 @@ def get_word_count(xhtml: str) -> int:
 	xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)
 
 	# Replace some formatting characters
-	xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+	xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml)
 
 	# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or), they count as a word boundry but they shouldn't
-	xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+	xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml)
 
 	# Replace sequential spaces with one space
-	xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+	xhtml = regex.sub(r"\s+", " ", xhtml)
 
 	# Get the word count
-	return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
+	return len(regex.findall(r"\b\w+\b", xhtml))
 
 def _replace_character_references(match_object) -> str:
 	"""Replace most XML character references with literal characters.
@@ -660,13 +660,13 @@ def format_xhtml(xhtml: str) -> str:
 	xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)
 
 	# Remove unnecessary doctypes which can cause xmllint to hang
-	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)
+	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml)
 
 	# Remove white space between opening/closing tag and text nodes
 	# We do this first so that we can still format line breaks after <br/>
 	# Exclude comments
 	xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml)
 
 	try:
 		tree = _format_xml_str(xhtml)
@@ -1088,7 +1088,7 @@ def format_css(css: str) -> str:
 	output = regex.sub(r"(@[\p{Letter}]+) \(", "\\1(", output)
 
 	# Remove empty rules
-	output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.DOTALL|regex.MULTILINE)
+	output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.MULTILINE)
 
 	return output
@@ -1103,7 +1103,7 @@ def remove_tags(text: str) -> str:
 		A string with all HTML tags removed
 	"""
 
-	return regex.sub(r"</?[^>]*?>", "", text, flags=regex.DOTALL)
+	return regex.sub(r"</?[^>]*?>", "", text)
 
 def get_ordinal(number: str) -> str:
 	"""
@@ -1296,7 +1296,7 @@ def make_url_safe(text: str) -> str:
 	text = regex.sub(r"['‘’`]", "", text)
 
 	# 5. Convert any non-digit, non-letter character to a space
-	text = regex.sub(r"[^0-9\p{Letter}]", " ", text, flags=regex.IGNORECASE)
+	text = regex.sub(r"[^0-9\p{Letter}]", " ", text)
 
 	# 6. Convert any instance of one or more space to a dash
 	text = regex.sub(r"\s+", "-", text)
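The IGNORECASE removals in formatting.py lean on the second rule: \w and \p{Letter} already match both cases, so the flag has nothing to act on. A small sketch of that, with invented sample strings:

    import regex

    # \w and \p{Letter} are case-agnostic with or without the flag.
    assert regex.findall(r"\b\w+\b", "Word COUNT test") == \
           regex.findall(r"\b\w+\b", "Word COUNT test", flags=regex.IGNORECASE)
    assert regex.findall(r"\p{Letter}+", "Ábc DEF") == \
           regex.findall(r"\p{Letter}+", "Ábc DEF", flags=regex.IGNORECASE)

    # Cased literals are a different story.
    assert regex.findall(r"abc", "ABC") == []
    assert regex.findall(r"abc", "ABC", flags=regex.IGNORECASE) == ["ABC"]
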
diff --git a/se/se_epub.py b/se/se_epub.py
index b8e55ae0..a34780a3 100644
--- a/se/se_epub.py
+++ b/se/se_epub.py
@@ -699,7 +699,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
 		output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
 		output_xhtml = output_xhtml.replace("xml|lang", "lang")
 		output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
-		output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml, flags=regex.MULTILINE)
+		output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml)
 
 		# The Nu HTML5 Validator barfs if non-void elements are self-closed (like )
 		# Try to un-self-close them for HTML5 output.
\s*", " ", out_string) return out_string diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 89b66c8e..b87c11cb 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title) # Normalize whitespace - title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip() + title = regex.sub(r"\s+", " ", title).strip() # Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out subtitle_matches = regex.findall(r"(.*?)(.*?)(.*?)", title, flags=regex.DOTALL) @@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, # Check for repeated punctuation, but first remove `&` so we don't match `&,` # Remove tds with repeated ” as they are probably ditto marks - matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)?", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE) + matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)?", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents) if matches: messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches)) @@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes])) # Check for two-em-dashes used for elision instead of three-em-dashes - matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE) + matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents) if matches: messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches)) @@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c # Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “ for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"): node.set_attr("id", "lint-" + str(node_number)) - temp_xhtml = temp_xhtml + f"

" + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "\n" + temp_xhtml = temp_xhtml + f"

" + regex.sub(r"\s+", " ", node.inner_text()) + "\n" node_number = node_number + 1 replacement_count = 1 @@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c (temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes # Remove contractions to reduce rsquo for next regex - temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE) + temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml) # Remove all runs of ldquo that are likely to spill to the next

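Same DOTALL reasoning as elsewhere in the commit: \s already matches newlines and tabs, so dropping the flag from the <br/> cleanup cannot change its output. A quick check with an invented title string:

    import regex

    out_string = "A Very Long<br/>\n\t\tChapter Title"

    collapsed = regex.sub(r"<br/>\s*", " ", out_string)
    assert collapsed == "A Very Long Chapter Title"
    # DOTALL makes no difference without "." in the pattern.
    assert collapsed == regex.sub(r"<br/>\s*", " ", out_string, flags=regex.DOTALL)
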
diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py
index 89b66c8e..b87c11cb 100644
--- a/se/se_epub_lint.py
+++ b/se/se_epub_lint.py
@@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
 	title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)
 
 	# Normalize whitespace
-	title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
+	title = regex.sub(r"\s+", " ", title).strip()
 
 	# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
 	subtitle_matches = regex.findall(r"(.*?)<span epub:type=\"subtitle\">(.*?)</span>(.*?)", title, flags=regex.DOTALL)
@@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
 	# Check for repeated punctuation, but first remove `&amp;` so we don't match `&amp;,`
 	# Remove tds with repeated ” as they are probably ditto marks
-	matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(.+?)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE)
+	matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(.+?)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents)
 	if matches:
 		messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
 		messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes]))
 
 	# Check for two-em-dashes used for elision instead of three-em-dashes
-	matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE)
+	matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents)
 	if matches:
 		messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
 	# Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “
 	for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"):
 		node.set_attr("id", "lint-" + str(node_number))
-		temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "</p>\n"
+		temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"\s+", " ", node.inner_text()) + "</p>\n"
 		node_number = node_number + 1
 
 	replacement_count = 1
@@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
 		(temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes
 
 	# Remove contractions to reduce rsquo for next regex
-	temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE)
+	temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml)
 
 	# Remove all runs of ldquo that are likely to spill to the next <p>
 	replacement_count = 1
 	while replacement_count > 0:
-		(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml, flags=regex.MULTILINE)
+		(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml)
 
 	# Match problem `‘` using regex, and if found, get the actual node text from the dom to return.
 	typos = []
@@ -3009,7 +3009,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
 
 	# Check for closing rdquo without opening ldquo.
 	# Remove tds in case rdquo means "ditto mark"
-	typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(.+?)?</td>", "", file_contents), flags=regex.DOTALL)
+	typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(.+?)?</td>", "", file_contents))
 
 	# We create a filter to try to exclude nested quotations
 	# Remove tags in case they're enclosing punctuation we want to match against at the end of a sentence.
diff --git a/se/typography.py b/se/typography.py
index 11f9dc91..f9d70264 100644
--- a/se/typography.py
+++ b/se/typography.py
@@ -103,10 +103,10 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	xhtml = xhtml.replace("——", "⸺")
 
 	# Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
-	xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml)
+	xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml)
+	xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml)
+	xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml)
 
 	# Now that we've fixed Smartypants' output, put our quotes back in
 	xhtml = xhtml.replace("!#se:rsquo#!", "’")
@@ -114,7 +114,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	# Remove spaces between en and em dashes
 	# Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
 	# We do a negative lookbehind for <br/>s from being included
-	xhtml = regex.sub(r"(?
@@ -136,8 +136,8 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	xhtml = xhtml.replace(se.SHY_HYPHEN, "")
 
 	# Fix some common em-dash transcription errors
-	xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml)
+	xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml)
 	xhtml = regex.sub(r":-]*)([0-9]+)\-([0-9]+)", r"\1–\2", xhtml)
@@ -146,7 +146,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	xhtml = regex.sub(fr"([\p{{Lowercase_Letter}}]){se.WORD_JOINER}—th\b", r"\1 —th", xhtml)
 
 	# Remove word joiners from following opening tags--they're usually never correct
-	xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml)
 
 	# Add a word joiner after em dashes within elements
 	xhtml = regex.sub(r"]*?)>—", fr"—{se.WORD_JOINER}", xhtml)
@@ -213,7 +213,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	xhtml = regex.sub(r"(\s)‘a’(\s)", r"\1’a’\2", xhtml, flags=regex.IGNORECASE)
 
 	# Years
-	xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml)
 
 	xhtml = regex.sub(r"‘([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]isn’t|[Tt]was|[Tt]ain’t|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]won|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on|[Uu]ns?|[Uu]d|[Cc]ept|[Oo]w|[Aa]ppen|[Ee])\b", r"’\1", xhtml)
@@ -245,34 +245,34 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
 	xhtml = regex.sub(r"(?
-	xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"</([^>]*?)>{se.HAIR_SPACE}…", r"</\1>…", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml)
+	xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml)
+	xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml)
+	xhtml = regex.sub(fr"</([^>]*?)>{se.HAIR_SPACE}…", r"</\1>…", xhtml)
 
 	# Remove spaces between opening tags and ellipses
-	xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml)
 
 	# Remove spaces between closing tags and ellipses
-	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml, flags=regex.IGNORECASE) # If followed by a letter, the single quote is probably a leading elision
-	xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
-	xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml)
+	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml) # If followed by a letter, the single quote is probably a leading elision
+	xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml)
+	xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml)
+	xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml)
+	xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml)
 
 	# Add nbsp to ellipses that open dialog
-	xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE)
+	xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml)
 
 	# Don't use . ... if within a clause
 	xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml)
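The typography changes are mostly the IGNORECASE rule applied to patterns that match only punctuation and whitespace, where case simply does not exist. For example, with the ellipsis-normalization pattern from this hunk (the sample dialogue is invented):

    import regex

    text = "Wait . . . what?"

    assert regex.sub(r"\s*\.\s*\.\s*\.\s*", "…", text) == "Wait…what?"
    # IGNORECASE has no cased character to act on, so it is a no-op.
    assert regex.sub(r"\s*\.\s*\.\s*\.\s*", "…", text) == \
           regex.sub(r"\s*\.\s*\.\s*\.\s*", "…", text, flags=regex.IGNORECASE)
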
fr"…{se.HAIR_SPACE}\1", xhtml) + xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml) + xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml) # Add nbsp to ellipses that open dialog - xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE) + xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml) # Don't use . ... if within a clause xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml) diff --git a/se/vendor/kobo_touch_extended/kobo.py b/se/vendor/kobo_touch_extended/kobo.py index 860b71db..ca9032fc 100644 --- a/se/vendor/kobo_touch_extended/kobo.py +++ b/se/vendor/kobo_touch_extended/kobo.py @@ -32,7 +32,7 @@ def append_kobo_spans_from_text(node, text): return False else: # Split text in sentences - groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text, flags=regex.MULTILINE) + groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text) # Remove empty strings resulting from split() groups = [g for g in groups if g != ""]