Skip to content

Commit

Permalink
refactor: remove code related to embedded text extraction (#349)
Browse files Browse the repository at this point in the history
This PR removes all code related to filling inferred elements' text from
embedded text (`pdfminer`). It is the first part of moving
embedded-text-related code from `unstructured-inference` to `unstructured`
and works together with
Unstructured-IO/unstructured#3061.
  • Loading branch information
christinestraub authored May 21, 2024
1 parent 76619ca commit 81549a7
Show file tree
Hide file tree
Showing 10 changed files with 7 additions and 234 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.7.32-dev1
## 0.7.32

* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).
* bug: set the Chipper max_length variable

## 0.7.31
Expand Down
50 changes: 0 additions & 50 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,16 +312,6 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
layout.DocumentLayout.from_image_file(tempdir)


@pytest.mark.parametrize("idx", range(2))
def test_get_elements_from_layout(mock_initial_layout, idx):
page = MockPageLayout()
block = mock_initial_layout[idx]
block.bbox.pad(3)
fixed_layout = [block]
elements = page.get_elements_from_layout(fixed_layout)
assert elements[0].text == block.text


def test_page_numbers_in_page_objects():
with patch(
"unstructured_inference.inference.layout.PageLayout.get_elements_with_detection_model",
Expand All @@ -331,40 +321,6 @@ def test_page_numbers_in_page_objects():
assert [page.number for page in doc.pages] == list(range(1, len(doc.pages) + 1))


@pytest.mark.parametrize(
("fixed_layouts", "called_method", "not_called_method"),
[
(
[MockLayout()],
"get_elements_from_layout",
"get_elements_with_detection_model",
),
(None, "get_elements_with_detection_model", "get_elements_from_layout"),
],
)
def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method):
with patch.object(
layout.PageLayout,
"get_elements_with_detection_model",
return_value=[],
), patch.object(
layout.PageLayout,
"get_elements_from_layout",
return_value=[],
):
layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf", fixed_layouts=fixed_layouts)
getattr(layout.PageLayout, called_method).assert_called()
getattr(layout.PageLayout, not_called_method).assert_not_called()


@pytest.mark.parametrize(
("text", "expected"),
[("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
)
def test_remove_control_characters(text, expected):
assert elements.remove_control_characters(text) == expected


no_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)
text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
overlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)
Expand Down Expand Up @@ -417,12 +373,6 @@ def check_annotated_image():
check_annotated_image()


@pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")])
def test_embedded_text_region(text, expected):
etr = elements.EmbeddedTextRegion.from_coords(0, 0, 24, 24, text=text)
assert etr.extract_text(objects=None) == expected


class MockDetectionModel(layout.UnstructuredObjectDetectionModel):
def initialize(self, *args, **kwargs):
pass
Expand Down
12 changes: 0 additions & 12 deletions test_unstructured_inference/inference/test_layout_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,6 @@
from unstructured_inference.inference.layoutelement import LayoutElement, TextRegion


def test_layout_element_extract_text(
mock_layout_element,
mock_text_region,
):
extracted_text = mock_layout_element.extract_text(
objects=[mock_text_region],
)

assert isinstance(extracted_text, str)
assert "Sample text" in extracted_text


def test_layout_element_do_dict(mock_layout_element):
expected = {
"coordinates": ((100, 100), (100, 300), (300, 300), (300, 100)),
Expand Down
13 changes: 0 additions & 13 deletions test_unstructured_inference/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,16 +272,3 @@ def test_merge_inferred_layout_with_extracted_layout():
assert merged_layout[0].text == "Example Section Header"
assert merged_layout[1].type == ElementType.TEXT
assert merged_layout[1].text == "Example Title"


def test_aggregate_by_block():
expected = "Inside region1 Inside region2"
embedded_regions = [
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
]
target_region = TextRegion.from_coords(0, 0, 300, 300)

text = elements.aggregate_by_block(target_region, embedded_regions)
assert text == expected
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.32-dev1" # pragma: no cover
__version__ = "0.7.32" # pragma: no cover
10 changes: 0 additions & 10 deletions unstructured_inference/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,6 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""
return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)

@property
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
"""threshold to determine if an embedded region is a sub-region of a given block
when aggregating the text from embedded elements that lie within the given block
When the intersection region area divided by self area is larger than this threshold self is
considered a subregion of the other
"""
return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)

@property
def ELEMENTS_H_PADDING_COEF(self) -> float:
"""When extending the boundaries of a PDF object for the purpose of determining which other
Expand Down
66 changes: 3 additions & 63 deletions unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from __future__ import annotations

import unicodedata
from copy import deepcopy
from dataclasses import dataclass
from typing import Collection, Optional, Union
from typing import Optional, Union

import numpy as np

from unstructured_inference.config import inference_config
from unstructured_inference.constants import Source
from unstructured_inference.math import safe_division

Expand Down Expand Up @@ -184,21 +182,6 @@ class TextRegion:
def __str__(self) -> str:
return str(self.text)

def extract_text(
self,
objects: Optional[Collection[TextRegion]],
) -> str:
"""Extracts text contained in region."""
if self.text is not None:
# If block text is already populated, we'll assume it's correct
text = self.text
elif objects is not None:
text = aggregate_by_block(self, objects)
else:
text = ""
cleaned_text = remove_control_characters(text)
return cleaned_text

@classmethod
def from_coords(
cls,
Expand All @@ -217,54 +200,11 @@ def from_coords(


class EmbeddedTextRegion(TextRegion):
def extract_text(
self,
objects: Optional[Collection[TextRegion]],
) -> str:
"""Extracts text contained in region."""
if self.text is None:
return ""
else:
return self.text
pass


class ImageTextRegion(TextRegion):
def extract_text(
self,
objects: Optional[Collection[TextRegion]],
) -> str:
"""Extracts text contained in region."""
if self.text is None:
return ""
else:
return super().extract_text(objects)


def aggregate_by_block(
text_region: TextRegion,
pdf_objects: Collection[TextRegion],
) -> str:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""

subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
filtered_blocks = [
obj
for obj in pdf_objects
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
]
text = " ".join([x.text for x in filtered_blocks if x.text])
return text


def remove_control_characters(text: str) -> str:
"""Removes control characters from text."""

# Replace newline character with a space
text = text.replace("\n", " ")
# Remove other control characters
out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
return out_text
pass


def region_bounding_boxes_are_almost_the_same(
Expand Down
40 changes: 1 addition & 39 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
from unstructured_inference.inference.layoutelement import (
LayoutElement,
)
from unstructured_inference.inference.ordering import order_layout
from unstructured_inference.logger import logger
from unstructured_inference.models.base import get_model
from unstructured_inference.models.chipper import UnstructuredChipperModel
from unstructured_inference.models.unstructuredmodel import (
UnstructuredElementExtractionModel,
UnstructuredObjectDetectionModel,
Expand Down Expand Up @@ -201,29 +199,6 @@ def get_elements_with_detection_model(

return inferred_layout

def get_elements_from_layout(
self,
layout: List[TextRegion],
pdf_objects: Optional[List[TextRegion]] = None,
) -> List[LayoutElement]:
"""Uses the given Layout to separate the page text into elements, either extracting the
text from the discovered layout blocks."""

# If the model is a chipper model, we don't want to order the
# elements, as they are already ordered
order_elements = not isinstance(self.detection_model, UnstructuredChipperModel)
if order_elements:
layout = order_layout(layout)

elements = [
get_element_from_block(
block=e,
pdf_objects=pdf_objects,
)
for e in layout
]
return elements

def _get_image_array(self) -> Union[np.ndarray, None]:
"""Converts the raw image into a numpy array."""
if self.image_array is None:
Expand Down Expand Up @@ -330,7 +305,7 @@ def from_image(
elif fixed_layout is None:
page.get_elements_with_detection_model()
else:
page.elements = page.get_elements_from_layout(fixed_layout)
page.elements = []

page.image_metadata = {
"format": page.image.format if page.image else None,
Expand Down Expand Up @@ -405,19 +380,6 @@ def process_file_with_model(
return layout


def get_element_from_block(
block: TextRegion,
pdf_objects: Optional[List[TextRegion]] = None,
) -> LayoutElement:
"""Creates a LayoutElement from a given layout or image by finding all the text that lies within
a given block."""
element = block if isinstance(block, LayoutElement) else LayoutElement.from_region(block)
element.text = element.extract_text(
objects=pdf_objects,
)
return element


def convert_pdf_to_image(
filename: str,
dpi: int = 200,
Expand Down
10 changes: 0 additions & 10 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,6 @@ class LayoutElement(TextRegion):
image_path: Optional[str] = None
parent: Optional[LayoutElement] = None

def extract_text(
self,
objects: Optional[Collection[TextRegion]],
):
"""Extracts text contained in region"""
text = super().extract_text(
objects=objects,
)
return text

def to_dict(self) -> dict:
"""Converts the class instance to dictionary form."""
out_dict = {
Expand Down
35 changes: 0 additions & 35 deletions unstructured_inference/inference/ordering.py

This file was deleted.

0 comments on commit 81549a7

Please sign in to comment.