fix: add get_words_in_area function for OCR

openfoodfacts · Oct 31, 2023 · 2ea5e27 · 2ea5e27
1 parent f459633
commit 2ea5e27
Show file tree

Hide file tree

Showing 2 changed files with 142 additions and 0 deletions.
diff --git a/openfoodfacts/ocr.py b/openfoodfacts/ocr.py
@@ -264,6 +264,20 @@ def get_words_from_indices(
             start_idx, end_idx, raises
         )
 
+    def get_words_in_area(
+        self, bounding_box: Tuple[int, int, int, int]
+    ) -> Optional[List["Word"]]:
+        """Collect the words lying inside `bounding_box`.
+
+        :param bounding_box: the area, in absolute coordinates
+        :return: the words contained in the area, or None when no full text
+          annotation is available
+        """
+        if not self.full_text_annotation:
+            return None
+
+        return self.full_text_annotation.get_words_in_area(bounding_box)
+
     def pprint(self):
         """Pretty print the full text annotation, if it is not null."""
         if self.full_text_annotation:
@@ -484,6 +498,19 @@ def _generate_pretty_print_string(self) -> str:
                     strings.append(f"    {repr(text)}")
         return "\n".join(strings)
 
+    def get_words_in_area(
+        self, bounding_box: Tuple[int, int, int, int]
+    ) -> List["Word"]:
+        """Collect the words of all pages lying inside `bounding_box`.
+
+        :param bounding_box: the area, in absolute coordinates, given as
+          (y_min, x_min, y_max, x_max)
+        :return: the words of every page that are contained in the area; an
+          empty list when there is none
+        """
+        per_page = (page.get_words_in_area(bounding_box) for page in self.pages)
+        return [word for page_words in per_page for word in page_words]
+
 
 class TextAnnotationPage:
     """Detected page from OCR."""
@@ -554,6 +581,19 @@ def get_words_from_indices(
                 break
         return selected, remaining
 
+    def get_words_in_area(
+        self, bounding_box: Tuple[int, int, int, int]
+    ) -> List["Word"]:
+        """Collect the words of the page lying inside `bounding_box`.
+
+        :param bounding_box: the area, in absolute coordinates, given as
+          (y_min, x_min, y_max, x_max)
+        :return: the words of every block that are contained in the area; an
+          empty list when there is none
+        """
+        per_block = (block.get_words_in_area(bounding_box) for block in self.blocks)
+        return [word for block_words in per_block for word in block_words]
+
 
 class Block:
     """Logical element on the page."""
@@ -651,6 +691,19 @@ def get_words_from_indices(
                 break
         return selected, remaining
 
+    def get_words_in_area(
+        self, bounding_box: Tuple[int, int, int, int]
+    ) -> List["Word"]:
+        """Collect the words of the block lying inside `bounding_box`.
+
+        :param bounding_box: absolute (y_min, x_min, y_max, x_max) area
+        :return: the words of every paragraph contained in the area
+        """
+        per_paragraph = (
+            paragraph.get_words_in_area(bounding_box) for paragraph in self.paragraphs
+        )
+        return [word for found in per_paragraph for word in found]
+
 
 class Paragraph:
     """Structural unit of text representing a number of words in certain
@@ -728,6 +781,17 @@ def get_words_from_indices(
 
         return selected, remaining
 
+    def get_words_in_area(
+        self, bounding_box: Tuple[int, int, int, int]
+    ) -> List["Word"]:
+        """Select the words of the paragraph lying inside `bounding_box`.
+
+        :param bounding_box: the area, in absolute coordinates, given as
+          (y_min, x_min, y_max, x_max)
+        :return: the words of the paragraph that are contained in the area
+        """
+        return get_words_in_area(words=self.words, bounding_box=bounding_box)
+
 
 class Word:
     """A word representation."""
@@ -1004,6 +1068,35 @@ def compute_words_union_bounding_box(words: List[Word]) -> Tuple[int, int, int,
     return (y_min, x_min, y_max, x_max)  # type: ignore
 
 
+def get_words_in_area(
+    words: List[Word], bounding_box: Tuple[int, int, int, int]
+) -> List[Word]:
+    """Filter `words`, keeping those that lie entirely inside an area.
+
+    A word is kept when the extent of its bounding polygon (the smallest
+    and largest x and y among its vertices) fits in the area, borders
+    included.
+
+    :param words: the words to filter
+    :param bounding_box: the area, in absolute coordinates, given as
+      (y_min, x_min, y_max, x_max)
+    :return: the words fully contained in `bounding_box`, in input order
+    """
+    top, left, bottom, right = bounding_box
+    inside = []
+    for candidate in words:
+        vertices = candidate.bounding_poly.vertices
+        # each vertex is indexed as (x, y)
+        xs = [vertex[0] for vertex in vertices]
+        ys = [vertex[1] for vertex in vertices]
+        x_low, y_low, x_high, y_high = min(xs), min(ys), max(xs), max(ys)
+
+        if left <= x_low and x_high <= right and top <= y_low and y_high <= bottom:
+            inside.append(candidate)
+
+    return inside
+
+
 class OCRTextAnnotation:
     __slots__ = ("locale", "text", "bounding_poly")
 

diff --git a/tests/test_ocr.py b/tests/test_ocr.py
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import pytest
+
+from openfoodfacts.ocr import OCRResult
+
+
+@pytest.mark.parametrize(
+    "ocr_url, bounding_box, expected_text",
+    [
+        (
+            # It corresponds to this OCR crop: https://robotoff.openfoodfacts.org/api/v1/images/crop?image_url=https://images.openfoodfacts.org/images/products/089/000/000/1202/1.jpg&y_min=0.08416666666666667&x_min=0.30077691453940064&y_max=0.09583333333333334&x_max=0.37735849056603776
+            "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json",
+            (101, 271, 115, 340),
+            "Materne",
+        ),
+        (
+            # same, but the bounding box is distinct from the logo area
+            "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json",
+            (120, 271, 134, 340),
+            None,
+        ),
+        (
+            # [0.2808293402194977,0.37121888995170593,0.35544055700302124,0.49409016966819763]
+            # /540/091/030/1160/1.jpg
+            "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/5400910301160_1.json",
+            (337, 327, 427, 436),
+            "NUTRIDIA",
+        ),
+    ],
+)
+def test_get_words_in_area(
+    ocr_url: str, bounding_box: tuple[int, int, int, int], expected_text: Optional[str]
+):
+    """Check the words found in an area of an OCR result loaded from a URL.
+
+    `bounding_box` is given as (y_min, x_min, y_max, x_max).
+    `expected_text` is the only word expected in the area, or None when the
+    area should not contain any word.
+    """
+    ocr_result = OCRResult.from_url(ocr_url)
+    words = ocr_result.get_words_in_area(bounding_box)
+
+    if expected_text is None:
+        assert words == []
+    else:
+        assert words is not None
+        assert len(words) == 1
+        assert words[0].text.strip() == expected_text