Simplify inner_text implementation using lxml's `tostring` with `method="text"`

Extract the element's text directly with `etree.tostring(..., method="text")`, rather than serializing the inner XML and then using a regex to remove tags and attributes after the fact. See https://lxml.de/api/lxml.etree-module.html#tostring for details. This also eliminates the need to perform HTML unescaping. On my local machine, this reduces the time spent in the inner_text method on Don Quixote from 1.5 seconds to 0.04 seconds.
standardebooks · Jun 22, 2024 · ad56328 · ad56328
1 parent a191db2
commit ad56328
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/se/easy_xml.py b/se/easy_xml.py
@@ -4,7 +4,6 @@
 The class exposes some helpful functions like css_select() and xpath().
 """
 
-from html import unescape
 from typing import Dict, List, Union, Optional
 import unicodedata
 
@@ -414,7 +413,8 @@ def inner_text(self) -> str:
 		`<p>Hello there, <abbr>Mr.</abbr> Smith!</p>` -> `Hello there, Mr. Smith!`
 		"""
 
-		return unescape(regex.sub(r"<[^>]+?>", "", self.inner_xml().strip()))
+		text = etree.tostring(self.lxml_element, encoding=str, method="text", with_tail=False)
+		return text.strip()
 
 	def remove(self) -> None:
 		"""