diff --git a/se/formatting.py b/se/formatting.py index b53fa700..4799f8b5 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -105,6 +105,7 @@ def semanticate(xhtml: str) -> str: xhtml = regex.sub(r"(?]*?\>))\b([1-4]D)\b", r"""\1""", xhtml) xhtml = regex.sub(r"(?]*?\>))(Thos\.|Jas\.|Chas\.|Wm\.)", r"""\1""", xhtml) xhtml = regex.sub(r"(?]*?\>))([ap])\.\s?m\.", r"\1.m.", xhtml) + xhtml = regex.sub(r"(?]*?\>))(4to|8vo|12mo|16mo|18mo|32mo|48mo|64mo)(?:\.(\s+\p{Lowercase_Letter}))?", r"\1\2", xhtml) # Book sizes xhtml = regex.sub(r"(?]*?\>))([0-9]{1,2})\s?[Aa]\.?\s?[Mm](?:\.|\b)", r"\1 a.m.", xhtml) xhtml = regex.sub(r"(?]*?\>))([0-9]{1,2})\s?[Pp]\.?\s?[Mm](?:\.|\b)", r"\1 p.m.", xhtml) # this should be placed after the am/pm test, to prevent tagging just the p. in "p. m."