Skip to content

Commit

Permalink
extraction fix: images in text nodes (#757)
Browse files Browse the repository at this point in the history
* refine table markdown output

* fix ut

* extract image in textnode

---------

Co-authored-by: CodyInnowhere <[email protected]>
  • Loading branch information
unsleepy22 and CodyInnowhere authored Dec 3, 2024
1 parent 4e59c8a commit b7bfcc3
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 2 deletions.
7 changes: 7 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ def test_images():
assert is_image_file('test.txt') is False
assert is_image_file('test.jpg'*2000) is False # length threshold
# tag with attributes
assert handle_image(None) is None
assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
Expand All @@ -494,6 +495,12 @@ def test_images():
assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True)
assert '<graphic src="test.jpg" title="Example image"/>' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG)
assert extract('<html><body><article><img data-src="test.jpg" alt="text" title="a title"/></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><p><img data-src="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'

assert handle_image(html.fromstring('<img src="" alt="text"></img>')) is None

# CNN example
mydoc = html.fromstring('<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781" src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-eq-state="mini xsmall small medium" data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">')
Expand Down
4 changes: 3 additions & 1 deletion trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
MANUALLY_CLEANED,
MANUALLY_STRIPPED,
)
from .utils import textfilter, trim
from .utils import textfilter, trim, is_image_element
from .xml import META_ATTRIBUTES, delete_element


Expand Down Expand Up @@ -226,6 +226,8 @@ def handle_textnode(
preserve_spaces: bool = False,
) -> Optional[_Element]:
"Convert, format, and probe potential text elements."
if elem.tag == "graphic" and is_image_element(elem):
return elem
if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail):
return None

Expand Down
10 changes: 9 additions & 1 deletion trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
# else:
# newsub.tail = processed_child.text
newsub.text, newsub.tail = processed_child.text, processed_child.tail

if processed_child.tag == 'graphic':
image_elem = handle_image(processed_child)
if image_elem is not None:
newsub = image_elem
processed_element.append(newsub)
child.tag = "done"
# finish
Expand Down Expand Up @@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
return None


def handle_image(element: _Element) -> Optional[_Element]:
def handle_image(element: Optional[_Element]) -> Optional[_Element]:
"Process image elements and their relevant attributes."
if element is None:
return None

processed_element = Element(element.tag)

for attr in ("data-src", "src"):
Expand Down
14 changes: 14 additions & 0 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,20 @@ def trim(string: str) -> str:
return ""


def is_image_element(element: _Element) -> bool:
    '''Tell whether one of the element's source attributes references an image file.'''
    # probe the two standard source attributes first, in this order
    if any(is_image_file(element.get(name, "")) for name in ("data-src", "src")):
        return True
    # otherwise accept any attribute whose name starts with "data-src"
    # and whose value passes the image file check
    return any(
        key.startswith("data-src") and is_image_file(val)
        for key, val in element.attrib.items()
    )


def is_image_file(imagesrc: Optional[str]) -> bool:
'''Check if the observed string corresponds to a valid image extension.
Use a length threshold and apply a regex on the content.'''
Expand Down

0 comments on commit b7bfcc3

Please sign in to comment.