From abfe094b8593bd51f1fa1644722c4463b4a03460 Mon Sep 17 00:00:00 2001 From: Ido Shamun Date: Mon, 20 Nov 2023 18:41:56 +0200 Subject: [PATCH] preserve space in certain elements (#429) * preserve spaces in code elements Closes #422 * set preserve tags as set and use sanitize * make tests pass * add pre to spacing protected * fix warnings and XML formatting * clean code * simplify code * simplify code --------- Co-authored-by: Adrien Barbaresi Co-authored-by: Adrien Barbaresi --- tests/unit_tests.py | 18 +++++++------- trafilatura/core.py | 6 ++--- trafilatura/utils.py | 56 ++++++++++++++++++++++++++++++++++---------- trafilatura/xml.py | 5 ++-- 4 files changed, 57 insertions(+), 28 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 73224f80..ed4bb046 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1065,7 +1065,7 @@ def test_code_blocks(): ''' testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml') - assert 'code\nhighlighted more code\n' in testresult and 'quote' not in testresult + assert 'code\n\nhighlighted more code\n' in testresult and 'quote' not in testresult github = '''
$ pip install PyGithub
''' testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml') expected = ''' -class Person: def __init__(self, name, age): -self.name = name self.age = agep1 = Person("John", -36) -print(p1.name)print(p1.age) ''' + class Person:\xa0 def __init__(self, name, age):\xa0\xa0\xa0 + self.name = name\xa0\xa0\xa0 self.age = agep1 = Person("John", + 36) + print(p1.name)print(p1.age) ''' assert expected in testresult and 'quote' not in testresult pip = '''

Code:

-
import openai
-    from openai_function_call import openai_function
''' +
import openai
+from openai_function_call import openai_function
''' expected = '''import openai from openai_function_call import openai_function''' testresult = extract(pip, config=ZERO_CONFIG, output_format='xml') @@ -1111,8 +1111,8 @@ class Person: def __init__(self, name, age): testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml') assert expected in testresult and 'quote' not in testresult medium_ssr = '''

Code:

-
import openai_function

@openai_functiondef sum(a:int, b:int):
"""Sum description adds a + b"""
''' - expected = 'import openai_function@openai_functiondef sum(a:int, b:int): """Sum description adds a + b"""' +
import openai_function

@openai_function
def sum(a:int, b:int):
"""Sum description adds a + b"""
''' + expected = '''import openai_function@openai_functiondef sum(a:int, b:int): """Sum description adds a + b"""''' testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml') assert expected in testresult and 'quote' not in testresult code_el = '''

Code:

diff --git a/trafilatura/core.py b/trafilatura/core.py index 2b91fbb3..f27638b2 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -27,7 +27,7 @@ process_node, prune_unwanted_nodes, tree_cleaning) from .metadata import Document, extract_metadata from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config -from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv +from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED from .xml import (build_json_output, build_tei_output, build_xml_output, control_xml_output, remove_empty_elements, strip_double_tags, xmltotxt) @@ -38,8 +38,6 @@ LOGGER = logging.getLogger(__name__) -FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'} -SPACING_PROTECTED = {'code', 'hi', 'ref'} P_FORMATTING = {'hi', 'ref'} TABLE_ELEMS = {'td', 'th'} TABLE_ALL = {'td', 'th', 'hi'} @@ -942,7 +940,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, include_comments, include_formatting, include_links, include_images, include_tables, deduplicate, target_language) - + # prune all xpath expressions that user specified # no backup as this is unetre full control of the user if prune_xpath is not None: diff --git a/trafilatura/utils.py b/trafilatura/utils.py index c7b9242d..3b8bbc04 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -74,6 +74,9 @@ STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$") +FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'ref', 'td'} +SPACING_PROTECTED = {'code', 'pre'} + def handle_compressed_file(filecontent): """Tell if a file's magic number corresponds to the GZip format @@ -257,29 +260,56 @@ def normalize_unicode(string, unicodeform='NFC'): @lru_cache(maxsize=1024) -def line_processing(line): +def line_processing(line, preserve_space=False, trailing_space=False): '''Remove HTML space entities, then discard incompatible unicode and invalid XML characters on line level''' # spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html # unique code spaces - line = line.replace(' ', '\r').replace(' ', '\n').replace(' ', '\u00A0').replace(';cs;', ' ') - # remove newlines that are not related to punctuation or markup - # remove non-printable chars and normalize space characters (including Unicode spaces) - line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line))) - # prune empty lines - if all(map(str.isspace, line)): - line = None - return line - - -def sanitize(text): + new_line = remove_control_characters(line.replace(' ', '\r').replace(' ', '\n').replace(' ', '\u00A0')) + if not preserve_space: + # remove newlines that are not related to punctuation or markup + # remove non-printable chars and normalize space characters (including Unicode spaces) + new_line = trim(LINES_TRIMMING.sub(r" ", new_line)) + # prune empty lines + if all(map(str.isspace, new_line)): + new_line = None + elif trailing_space: + space_before = " " if line[0] == " " else "" + space_after = " " if line[-1] == " " else "" + new_line = "".join([space_before, new_line, space_after]) + return new_line + + +def sanitize(text, preserve_space=False, trailing_space=False): '''Convert text and discard incompatible and invalid characters''' + # consider all text as a single line + if trailing_space: + return line_processing(text, preserve_space, True) + # process line by line try: - return '\n'.join(filter(None, (line_processing(l) for l in text.splitlines()))) + return '\n'.join(filter(None, (line_processing(l, preserve_space) for l in text.splitlines()))) except AttributeError: return None +def sanitize_tree(tree): + '''Trims spaces, removes control characters and normalizes unicode''' + for elem in tree.iter(): + parent = elem.getparent() + parent_tag = parent.tag if parent is not None else "" + + # preserve space if the element or its parent is a specific tag, or if the element has text and children + # the last part is relevant for item elements with ref inside for example + preserve_space = elem.tag in SPACING_PROTECTED or parent_tag in SPACING_PROTECTED + trailing_space = elem.tag in FORMATTING_PROTECTED or parent_tag in FORMATTING_PROTECTED or preserve_space + + if elem.text: + elem.text = sanitize(elem.text, preserve_space, trailing_space) + if elem.tail: + elem.tail = sanitize(elem.tail, preserve_space, trailing_space) + return tree + + @lru_cache(maxsize=1024) def trim(string): '''Remove unnecessary spaces within a text string''' diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 33592aa4..8cc5fc84 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -18,7 +18,7 @@ from . import __version__ from .filters import text_chars_test -from .utils import sanitize +from .utils import sanitize, sanitize_tree LOGGER = logging.getLogger(__name__) # validation @@ -117,8 +117,9 @@ def build_xml_output(docmeta): def control_xml_output(output_tree, output_format, tei_validation, docmeta): '''Make sure the XML output is conform and valid if required''' - control_string = sanitize(tostring(output_tree, encoding='unicode')) + output_tree = sanitize_tree(output_tree) # necessary for cleaning + control_string = tostring(output_tree, encoding='unicode') output_tree = fromstring(control_string, CONTROL_PARSER) # validate if output_format == 'xmltei' and tei_validation is True: