Skip to content

Commit

Permalink
preserve space in certain elements (#429)
Browse files Browse the repository at this point in the history
* preserve spaces in code elements

Closes #422

* set preserve tags as set and use sanitize

* make tests pass

* add pre to spacing protected

* fix warnings and XML formatting

* clean code

* simplify code

* simplify code

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
3 people authored Nov 20, 2023
1 parent 0ebbbcb commit abfe094
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 28 deletions.
18 changes: 9 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ def test_code_blocks():
</code></pre>
</div>'''
testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
assert '<code>code\n\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
<clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0 tooltipped-no-delay" data-copy-feedback="Copied!" data-tooltip-direction="w" value="$ pip install PyGithub" tabindex="0" role="button" style="display: inherit;">
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
Expand Down Expand Up @@ -1093,14 +1093,14 @@ def test_code_blocks():
</div>'''
testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
expected = '''<code>
class Person:<lb/> def __init__(self, name, age):<lb/>
self.name = name<lb/> self.age = age<lb/><lb/>p1 = Person("John",
36)<lb/>
<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
class Person:<lb/>\xa0 def __init__(self, name, age):<lb/>\xa0\xa0\xa0
self.name = name<lb/>\xa0\xa0\xa0 self.age = age<lb/><lb/>p1 = Person("John",
36)<lb/>
<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
assert expected in testresult and 'quote' not in testresult
pip = '''<div><p>Code:</p>
<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
expected = '''<code>import openai
from openai_function_call import openai_function</code>'''
testresult = extract(pip, config=ZERO_CONFIG, output_format='xml')
Expand All @@ -1111,8 +1111,8 @@ class Person:<lb/> def __init__(self, name, age):<lb/>
testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
medium_ssr = '''<div><p>Code:</p>
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_functiondef sum(a:int, b:int):<br/> &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
expected = '<code>import openai_function<lb/><lb/>@openai_functiondef sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_function<br>def sum(a:int, b:int):<br> &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
expected = '''<code>import openai_function<lb/><lb/>@openai_function<lb/>def sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'''
testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
code_el = '''<div><p>Code:</p>
Expand Down
6 changes: 2 additions & 4 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
xmltotxt)
Expand All @@ -38,8 +38,6 @@

LOGGER = logging.getLogger(__name__)

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'}
SPACING_PROTECTED = {'code', 'hi', 'ref'}
P_FORMATTING = {'hi', 'ref'}
TABLE_ELEMS = {'td', 'th'}
TABLE_ALL = {'td', 'th', 'hi'}
Expand Down Expand Up @@ -942,7 +940,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
include_comments, include_formatting, include_links,
include_images, include_tables, deduplicate,
target_language)

# prune all xpath expressions that user specified
# no backup as this is under full control of the user
if prune_xpath is not None:
Expand Down
56 changes: 43 additions & 13 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@

STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'ref', 'td'}
SPACING_PROTECTED = {'code', 'pre'}


def handle_compressed_file(filecontent):
"""Tell if a file's magic number corresponds to the GZip format
Expand Down Expand Up @@ -257,29 +260,56 @@ def normalize_unicode(string, unicodeform='NFC'):


@lru_cache(maxsize=1024)
def line_processing(line):
def line_processing(line, preserve_space=False, trailing_space=False):
'''Remove HTML space entities, then discard incompatible unicode
and invalid XML characters on line level'''
# spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
# unique code spaces
line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0').replace(';cs;', ' ')
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))
# prune empty lines
if all(map(str.isspace, line)):
line = None
return line


def sanitize(text):
new_line = remove_control_characters(line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0'))
if not preserve_space:
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
new_line = trim(LINES_TRIMMING.sub(r" ", new_line))
# prune empty lines
if all(map(str.isspace, new_line)):
new_line = None
elif trailing_space:
space_before = " " if line[0] == " " else ""
space_after = " " if line[-1] == " " else ""
new_line = "".join([space_before, new_line, space_after])
return new_line


def sanitize(text, preserve_space=False, trailing_space=False):
'''Convert text and discard incompatible and invalid characters'''
# consider all text as a single line
if trailing_space:
return line_processing(text, preserve_space, True)
# process line by line
try:
return '\n'.join(filter(None, (line_processing(l) for l in text.splitlines())))
return '\n'.join(filter(None, (line_processing(l, preserve_space) for l in text.splitlines())))
except AttributeError:
return None


def sanitize_tree(tree):
    '''Run sanitize() over the text and tail of every element of the tree.

    The flags passed to sanitize() depend on the tag of the element and of
    its direct parent:
    - preserve_space is set when either tag is in SPACING_PROTECTED;
    - trailing_space is set when either tag is in FORMATTING_PROTECTED,
      and also whenever preserve_space is set.

    The tree is modified in place and returned.'''
    for node in tree.iter():
        ancestor = node.getparent()
        # the element's own tag plus its parent's tag ("" when there is no parent)
        tags = (node.tag, "" if ancestor is None else ancestor.tag)

        keep_spacing = any(tag in SPACING_PROTECTED for tag in tags)
        keep_edges = keep_spacing or any(tag in FORMATTING_PROTECTED for tag in tags)

        # text first, then tail; empty or missing strings are left untouched
        for attribute in ("text", "tail"):
            content = getattr(node, attribute)
            if content:
                setattr(node, attribute, sanitize(content, keep_spacing, keep_edges))
    return tree


@lru_cache(maxsize=1024)
def trim(string):
'''Remove unnecessary spaces within a text string'''
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from . import __version__
from .filters import text_chars_test
from .utils import sanitize
from .utils import sanitize, sanitize_tree

LOGGER = logging.getLogger(__name__)
# validation
Expand Down Expand Up @@ -117,8 +117,9 @@ def build_xml_output(docmeta):

def control_xml_output(output_tree, output_format, tei_validation, docmeta):
'''Make sure the XML output is conformant and valid if required'''
control_string = sanitize(tostring(output_tree, encoding='unicode'))
output_tree = sanitize_tree(output_tree)
# necessary for cleaning
control_string = tostring(output_tree, encoding='unicode')
output_tree = fromstring(control_string, CONTROL_PARSER)
# validate
if output_format == 'xmltei' and tei_validation is True:
Expand Down

0 comments on commit abfe094

Please sign in to comment.