diff --git a/CHANGELOG.md b/CHANGELOG.md index f43c7976..9eb44fc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.11 + +* Fixed some cases where image elements were not being OCR'd + ## 0.2.10 * Removed control characters from tesseract output diff --git a/scripts/version-sync.sh b/scripts/version-sync.sh index 3b3e17bf..e0e873c5 100755 --- a/scripts/version-sync.sh +++ b/scripts/version-sync.sh @@ -16,7 +16,7 @@ done # Version appearing earliest in CHANGELOGFILE will be used as ground truth. CHANGELOGFILE="CHANGELOG.md" VERSIONFILE="unstructured_inference/__version__.py" -RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" +RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" # Pull out semver appearing earliest in CHANGELOGFILE. LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE") diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index e1cd08d7..251ee4cb 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -230,6 +230,9 @@ class MockLayout: def __init__(self, *elements): self.elements = elements + def __len__(self): + return len(self.elements) + def sort(self, key, inplace): return self.elements @@ -239,6 +242,9 @@ def __iter__(self): def get_texts(self): return [el.text for el in self.elements] + def filter_by(self, *args, **kwargs): + return MockLayout() + @pytest.mark.parametrize( "block_text, layout_texts, expected_text", @@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image): ) def test_remove_control_characters(text, expected): assert layout.remove_control_characters(text) == expected + + +def test_interpret_called_when_filter_empty(mock_image): + with patch("unstructured_inference.inference.layout.interpret_text_block"): + layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout()) + layout.interpret_text_block.assert_called_once() diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 47aedffe..ac745293 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.10" # pragma: no cover +__version__ = "0.2.11" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ba13eff9..a3430cb8 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -298,6 +298,13 @@ def aggregate_by_block( """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" filtered_blocks = layout.filter_by(text_block, center=True) + # NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block + # we can try interpreting the whole block. This still doesn't handle edge cases, like when there + # are some text elements within the block, but there are image elements overlapping the block + # with text lying within the block. In this case the text in the image would likely be ignored. + if not filtered_blocks: + text = interpret_text_block(text_block, image, ocr_strategy) + return text for little_block in filtered_blocks: little_block.text = interpret_text_block(little_block, image, ocr_strategy) text = " ".join([x for x in filtered_blocks.get_texts() if x])