extraction fix: images in text nodes (#757)

* refine table markdown output
* fix ut
* extract image in textnode

---------

Co-authored-by: CodyInnowhere <[email protected]>
adbar · Dec 3, 2024 · b7bfcc3 · b7bfcc3
1 parent 4e59c8a
commit b7bfcc3
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 2 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -483,6 +483,7 @@ def test_images():
     assert is_image_file('test.txt') is False
     assert is_image_file('test.jpg'*2000) is False  # length threshold
     # tag with attributes
+    assert handle_image(None) is None
     assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
     assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
     assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
@@ -494,6 +495,12 @@ def test_images():
     assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True)
     assert '<graphic src="test.jpg" title="Example image"/>' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG)
     assert extract('<html><body><article><img data-src="test.jpg" alt="text" title="a title"/></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><p><img data-src="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
+    assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+
+    assert handle_image(html.fromstring('<img src="data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="text"></img>')) is None
 
     # CNN example
     mydoc = html.fromstring('<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781" src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-eq-state="mini xsmall small medium" data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">')

diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
@@ -20,7 +20,7 @@
     MANUALLY_CLEANED,
     MANUALLY_STRIPPED,
 )
-from .utils import textfilter, trim
+from .utils import textfilter, trim, is_image_element
 from .xml import META_ATTRIBUTES, delete_element
 
 
@@ -226,6 +226,8 @@ def handle_textnode(
     preserve_spaces: bool = False,
 ) -> Optional[_Element]:
     "Convert, format, and probe potential text elements."
+    if elem.tag == "graphic" and is_image_element(elem):
+        return elem
     if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail):
         return None
 

diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
@@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
             #    else:
             #        newsub.tail = processed_child.text
             newsub.text, newsub.tail = processed_child.text, processed_child.tail
+
+            if processed_child.tag == 'graphic':
+                image_elem = handle_image(processed_child)
+                if image_elem is not None:
+                    newsub = image_elem
             processed_element.append(newsub)
         child.tag = "done"
     # finish
@@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
     return None
 
 
-def handle_image(element: _Element) -> Optional[_Element]:
+def handle_image(element: Optional[_Element]) -> Optional[_Element]:
     "Process image elements and their relevant attributes."
+    if element is None:
+        return None
+
     processed_element = Element(element.tag)
 
     for attr in ("data-src", "src"):

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -346,6 +346,20 @@ def trim(string: str) -> str:
         return ""
 
 
+def is_image_element(element: _Element) -> bool:
+    '''Check if an element carries a usable image source.
+
+    Return True if "data-src", "src" or any attribute whose name starts
+    with "data-src" holds a value accepted by is_image_file().'''
+    if any(is_image_file(element.get(attr, "")) for attr in ("data-src", "src")):
+        return True
+    # suffixed variants such as "data-src-small" also count as sources
+    return any(
+        is_image_file(value) for attr, value in element.attrib.items()
+        if attr.startswith("data-src")
+    )
+
+
 def is_image_file(imagesrc: Optional[str]) -> bool:
     '''Check if the observed string corresponds to a valid image extension.
        Use a length threshold and apply a regex on the content.'''