preserve space in certain elements (#429)

* preserve spaces in code elements
  Closes #422
* set preserve tags as set and use sanitize
* make tests pass
* add pre to spacing protected
* fix warnings and XML formatting
* clean code
* simplify code
* simplify code

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
adbar · Nov 20, 2023 · abfe094 · abfe094
1 parent 0ebbbcb
commit abfe094
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 28 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1065,7 +1065,7 @@ def test_code_blocks():
 </code></pre>
 </div>'''
     testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
-    assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
+    assert '<code>code\n\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
     github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
     <clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0 tooltipped-no-delay" data-copy-feedback="Copied!" data-tooltip-direction="w" value="$ pip install PyGithub" tabindex="0" role="button" style="display: inherit;">
       <svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
@@ -1093,14 +1093,14 @@ def test_code_blocks():
 </div>'''
     testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
     expected = '''<code>
-class Person:<lb/> def __init__(self, name, age):<lb/>
-self.name = name<lb/> self.age = age<lb/><lb/>p1 = Person("John",
-36)<lb/>
-<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
+  class Person:<lb/>\xa0 def __init__(self, name, age):<lb/>\xa0\xa0\xa0 
+  self.name = name<lb/>\xa0\xa0\xa0 self.age = age<lb/><lb/>p1 = Person("John", 
+  36)<lb/>
+  <lb/>print(p1.name)<lb/>print(p1.age) </code>'''
     assert expected in testresult and 'quote' not in testresult
     pip = '''<div><p>Code:</p>
-    <pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
-    <span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
+<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
+<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
     expected = '''<code>import openai
 from openai_function_call import openai_function</code>'''
     testresult = extract(pip, config=ZERO_CONFIG, output_format='xml')
@@ -1111,8 +1111,8 @@ class Person:<lb/> def __init__(self, name, age):<lb/>
     testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml')
     assert expected in testresult and 'quote' not in testresult
     medium_ssr = '''<div><p>Code:</p>
-    <pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_functiondef sum(a:int, b:int):<br/>  &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
-    expected = '<code>import openai_function<lb/><lb/>@openai_functiondef sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'
+    <pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_function<br>def sum(a:int, b:int):<br>  &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
+    expected = '''<code>import openai_function<lb/><lb/>@openai_function<lb/>def sum(a:int, b:int):<lb/>  """Sum description adds a + b"""</code>'''
     testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml')
     assert expected in testresult and 'quote' not in testresult
     code_el = '''<div><p>Code:</p>

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -27,7 +27,7 @@
                              process_node, prune_unwanted_nodes, tree_cleaning)
 from .metadata import Document, extract_metadata
 from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
-from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv
+from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
 from .xml import (build_json_output, build_tei_output, build_xml_output,
                   control_xml_output, remove_empty_elements, strip_double_tags,
                   xmltotxt)
@@ -38,8 +38,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'}
-SPACING_PROTECTED = {'code', 'hi', 'ref'}
 P_FORMATTING = {'hi', 'ref'}
 TABLE_ELEMS = {'td', 'th'}
 TABLE_ALL = {'td', 'th', 'hi'}
@@ -942,7 +940,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                             include_comments, include_formatting, include_links,
                             include_images, include_tables, deduplicate,
                             target_language)
-        
+
         # prune all xpath expressions that user specified
         # no backup as this is unetre full control of the user
         if prune_xpath is not None:

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -74,6 +74,9 @@
 
 STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")
 
+FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'ref', 'td'}
+SPACING_PROTECTED = {'code', 'pre'}
+
 
 def handle_compressed_file(filecontent):
     """Tell if a file's magic number corresponds to the GZip format
@@ -257,29 +260,56 @@ def normalize_unicode(string, unicodeform='NFC'):
 
 
 @lru_cache(maxsize=1024)
-def line_processing(line):
+def line_processing(line, preserve_space=False, trailing_space=False):
     '''Remove HTML space entities, then discard incompatible unicode
        and invalid XML characters on line level'''
     # spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
     # unique code spaces
-    line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0').replace(';cs;', ' ')
-    # remove newlines that are not related to punctuation or markup
-    # remove non-printable chars and normalize space characters (including Unicode spaces)
-    line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))
-    # prune empty lines
-    if all(map(str.isspace, line)):
-        line = None
-    return line
-
-
-def sanitize(text):
+    new_line = remove_control_characters(line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0'))
+    if not preserve_space:
+        # remove newlines that are not related to punctuation or markup
+        # remove non-printable chars and normalize space characters (including Unicode spaces)
+        new_line = trim(LINES_TRIMMING.sub(r" ", new_line))
+        # prune empty lines
+        if all(map(str.isspace, new_line)):
+            new_line = None
+        elif trailing_space:
+            space_before = " " if line[0] == " " else ""
+            space_after = " " if line[-1] == " " else ""
+            new_line = "".join([space_before, new_line, space_after])
+    return new_line
+
+
+def sanitize(text, preserve_space=False, trailing_space=False):
     '''Convert text and discard incompatible and invalid characters'''
+    # consider all text as a single line
+    if trailing_space:
+        return line_processing(text, preserve_space, True)
+    # process line by line
     try:
-        return '\n'.join(filter(None, (line_processing(l) for l in text.splitlines())))
+        return '\n'.join(filter(None, (line_processing(l, preserve_space) for l in text.splitlines())))
     except AttributeError:
         return None
 
 
+def sanitize_tree(tree):
+    '''Trims spaces, removes control characters and normalizes unicode'''
+    for elem in tree.iter():
+        parent = elem.getparent()
+        parent_tag = parent.tag if parent is not None else ""
+
+        # preserve space if the element or its parent is a specific tag, or if the element has text and children
+        # the last part is relevant for item elements with ref inside for example
+        preserve_space = elem.tag in SPACING_PROTECTED or parent_tag in SPACING_PROTECTED
+        trailing_space = elem.tag in FORMATTING_PROTECTED or parent_tag in FORMATTING_PROTECTED or preserve_space
+
+        if elem.text:
+            elem.text = sanitize(elem.text, preserve_space, trailing_space)
+        if elem.tail:
+            elem.tail = sanitize(elem.tail, preserve_space, trailing_space)
+    return tree
+
+
 @lru_cache(maxsize=1024)
 def trim(string):
     '''Remove unnecessary spaces within a text string'''

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -18,7 +18,7 @@
 
 from . import __version__
 from .filters import text_chars_test
-from .utils import sanitize
+from .utils import sanitize, sanitize_tree
 
 LOGGER = logging.getLogger(__name__)
 # validation
@@ -117,8 +117,9 @@ def build_xml_output(docmeta):
 
 def control_xml_output(output_tree, output_format, tei_validation, docmeta):
     '''Make sure the XML output is conform and valid if required'''
-    control_string = sanitize(tostring(output_tree, encoding='unicode'))
+    output_tree = sanitize_tree(output_tree)
     # necessary for cleaning
+    control_string = tostring(output_tree, encoding='unicode')
     output_tree = fromstring(control_string, CONTROL_PARSER)
     # validate
     if output_format == 'xmltei' and tei_validation is True: