Skip to content

Commit

Permalink
fix: removing control characters (#66)
Browse files Browse the repository at this point in the history
Tesseract is putting some control characters in out_text; this commit simply deletes all of them.

* Style correction

* Version sync
  • Loading branch information
benjats07 authored Mar 7, 2023
1 parent d332b65 commit 237d69d
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.2.10

* Removed control characters from tesseract output

## 0.2.9

* Removed multithreading from OCR (DocumentLayout.get_elements_from_layout)
Expand Down
7 changes: 7 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,10 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method)
def test_invalid_ocr_strategy_raises(mock_image):
    """Constructing a PageLayout with an unknown ``ocr_strategy`` raises ValueError."""
    unsupported_strategy = "fake_strategy"
    # MockLayout() is built inside the ``raises`` context on purpose, so the
    # whole constructor expression is evaluated under the expectation.
    with pytest.raises(ValueError):
        layout.PageLayout(
            0,
            mock_image,
            MockLayout(),
            ocr_strategy=unsupported_strategy,
        )


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Tab, form feed (both spellings), newline, carriage return and
        # backspace are all dropped with nothing inserted in their place.
        ("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"),
        # Quotes and a backslash are ordinary punctuation and must survive.
        ("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """Check that remove_control_characters strips control characters only."""
    cleaned = layout.remove_control_characters(text)
    assert cleaned == expected
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.9" # pragma: no cover
__version__ = "0.2.10" # pragma: no cover
9 changes: 8 additions & 1 deletion unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import tempfile
from tqdm import tqdm
from typing import List, Optional, Tuple, Union, BinaryIO

import unicodedata
from layoutparser.io.pdf import load_pdf
from layoutparser.elements.layout_elements import TextBlock
from layoutparser.elements.layout import Layout
Expand Down Expand Up @@ -318,6 +318,7 @@ def interpret_text_block(
out_text = ocr(text_block, image)
else:
out_text = "" if text_block.text is None else text_block.text
out_text = remove_control_characters(out_text)
return out_text


Expand All @@ -329,3 +330,9 @@ def ocr(text_block: TextBlock, image: Image.Image) -> str:
padded_block = text_block.pad(left=5, right=5, top=5, bottom=5)
cropped_image = padded_block.crop_image(image_array)
return tesseract.ocr_agent.detect(cropped_image)


def remove_control_characters(text: str) -> str:
    """Return ``text`` without any character in a Unicode "C" general category.

    A character is dropped when ``unicodedata.category`` reports a category
    whose first letter is ``"C"``. The dropped characters are removed
    outright, with no replacement: tab, newline and carriage return fall in
    this group, so words separated only by them end up joined.
    """
    kept_chars = [
        char for char in text if not unicodedata.category(char).startswith("C")
    ]
    return "".join(kept_chars)

0 comments on commit 237d69d

Please sign in to comment.