robocorp · mmokko · Feb 7, 2024 · Jan 31, 2024 · Feb 6, 2024 · Feb 6, 2024
diff --git a/ExtendedPDF.py b/ExtendedPDF.py
diff --git a/PDFMinerLibrary.py b/PDFMinerLibrary.py
@@ -0,0 +1,47 @@
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextBoxHorizontal
+
+from typing import List
+
+def find_row(pdf_path: str, search_text: str) -> List[str]:
+    """
+    Find all elements from the same row by matching the coordinates.
+    """
+    for page_layout in extract_pages(pdf_path):
+        horizontal_text_box_elements = [element for element in page_layout if isinstance(element, LTTextBoxHorizontal)]
+        search_elements = [element for element in horizontal_text_box_elements if search_text in element.get_text()]
+        for search_element in search_elements:
+            x0, y0, _, y1 = search_element.bbox
+            row_elements = []
+            # match all elements
+            for element in horizontal_text_box_elements:
+                # add the element we are using to search
+                if element == search_element:
+                    row_elements.append(element.get_text().strip())
+                    continue
+                ex0, ey0, _, ey1 = element.bbox
+                # Check if the element is at the same y-coordinate and after the the search element in the x-coordinate
+                if (ey0 == y0 and ey1 == y1 and ex0 > x0):
+                    row_elements.append(element.get_text().strip())
+            return row_elements
+
+def find_column(pdf_path: str, search_text: str):
+    columns_elements = []
+    for page_layout in extract_pages(pdf_path):
+        horizontal_text_box_elements = [element for element in page_layout if isinstance(element, LTTextBoxHorizontal)]
+        search_elements = [element for element in horizontal_text_box_elements if search_text in element.get_text()]
+        for search_element in search_elements:
+            x0, ey0, x1, _ = search_element.bbox
+            # match all elements
+            for element in horizontal_text_box_elements:
+                # add the element we are using to search
+                if element == search_element:
+                    columns_elements.append(element.get_text().strip())
+                    continue
+                ex0, y0, ex1, _ = element.bbox
+                # Check if the element is at the same x-coordinate (give or take)
+                if (ex0 >= x0 and ex1 <= (x1 + 1) and ey0 > y0):
+                    columns_elements.append(element.get_text().strip())
+            return columns_elements
+
+
diff --git a/PyPDFLibrary.py b/PyPDFLibrary.py
@@ -0,0 +1,54 @@
+import io
+import re
+from typing import Any, Callable, Dict, List, TypeVar
+
+from pypdf import PdfReader
+
+T = TypeVar("T", bound=Callable[..., Any])
+
+
+class PyPDFLibrary:
+    def __init__(self):
+        self._fh: io.FileIO | None = None
+        self._reader: PdfReader | None = None
+
+    def _validate_reader(func: T) -> T:
+        def wrapper(self: "PyPDFLibrary", *args: Any, **kwargs: Any):
+            if not self._reader:
+                raise ValueError("Open PDF file first")
+            return func(self, *args, **kwargs)
+
+        return wrapper
+
+    def open_pdf(self, file_path: str) -> None:
+        self._fh = open(file_path, "rb")
+        self._reader = PdfReader(self._fh)
+
+    @_validate_reader
+    def parse_text(self) -> Dict[int, str]:
+        pages = {}
+        for page_index, page in enumerate(self._reader.pages):
+            pages[page_index] = page.extract_text()
+        return pages
+
+    @staticmethod
+    def flatten(nested: List[any]) -> List[str]:
+        return [
+            element
+            for sublist in nested
+            for element in (sublist if isinstance(sublist, tuple) else (sublist,))
+        ]
+
+    @_validate_reader
+    def find_matches(self, pattern: re.Pattern) -> List[str]:
+        matches = []
+        for page in self._reader.pages:
+            text = page.extract_text()
+            match = re.findall(pattern, text, re.MULTILINE)
+            if match:
+                matches.extend(self.flatten(match))
+        return matches
+
+    def close(self) -> None:
+        if self._fh:
+            self._fh.close()
diff --git a/README.md b/README.md
@@ -9,32 +9,13 @@ information out of PDF files.
 
 ## Tasks
 
-### `Extract Text Data With RPA`
+### `Extract Text Data`
 
-Extract textual data with the local help of `RPA.PDF` library.
-
-Watch demo: https://www.loom.com/share/4bc28045f45941d18d935a60e35e227a
+Extract textual data from a PDF file.
 
 > Usually this is sufficient for most of the cases.
 
 
-### `Extract Tabular Data With Camelot`
-
-Extract tables with the Camelot library. (see external
-[dependency](https://pypi.org/project/camelot-py/))
-
-Watch demo: https://www.loom.com/share/87d434dad8d748ada566a8c199cbea90
-
-> This is useful for getting out nicely formatted tabular data, but comes at the cost
-> of heavier dependencies brought in the built environment.
-
-### `Extract Structured Data With AI`
-
-Extract fields detected in both text or image-based PDFs using 3rd-party external
-services wrapped by the `RPA.DocumentAI` library.
-
-Watch demo: https://www.loom.com/share/e9753b884b6f4aa1ac0271580a0cf682
+### `Extract Elements`
 
-> When all the options above fail (or provide inaccurate data), it is time to employ a
-> Machine Learning model specially trained to detect and structure fields of interest
-> from the provided input file, be it text-based or even image.
+In some cases it may be easier to locate an element and its neighbours than to parse the raw text. In this example we find rows and columns from a table in a PDF document.
diff --git a/conda.yaml b/conda.yaml
@@ -8,10 +8,10 @@ channels:
 dependencies:
   # Define conda-forge packages here -> https://anaconda.org/search
   # When available, prefer the conda-forge packages over pip as installations are more efficient.
-  - python=3.9.13               # https://pyreadiness.org/3.9/
-  - camelot-py=0.10.1
-  - pip=22.1.2                  # https://pip.pypa.io/en/stable/news/
+  - python=3.10.13              # https://pyreadiness.org/3.10/
+  - pip=23.3.2                  # https://pip.pypa.io/en/stable/news/
   - pip:
       # Define pip packages here -> https://pypi.org/
-      - rpaframework==21.0.0    # https://rpaframework.org/releasenotes.html
-      - camelot-py[base]==0.10.1
+      - robocorp==1.6.1
+      - pypdf==4.0.1
+      - pdfminer.six==20231228
diff --git a/robot.yaml b/robot.yaml
@@ -2,13 +2,10 @@
 # https://github.com/robocorp/rcc/blob/master/docs/recipes.md#what-is-in-robotyaml
 
 tasks:
-  # Task names here are used when executing the bots, so renaming these is recommended.
-  Extract Text Data With RPA:
-    robotTaskName: Extract Text Data With RPA
-  Extract Tabular Data With Camelot:
-    robotTaskName: Extract Tabular Data With Camelot
-  Extract Structured Data With AI:
-    robotTaskName: Extract Structured Data With AI
+  Extract Text Data:
+    shell: python -m robocorp.tasks run -t extract_text_data
+  Extract Elements:
+    # The -t value must match the name of the @task function in tasks.py.
+    shell: python -m robocorp.tasks run -t extract_elements_from_table
 
 condaConfigFile: conda.yaml
 

diff --git a/tasks.py b/tasks.py
@@ -0,0 +1,81 @@
+import os
+
+from robocorp import log
+from robocorp.tasks import task
+
+from PyPDFLibrary import PyPDFLibrary
+from PDFMinerLibrary import find_row, find_column
+
+# Sample invoice opened by the `extract_text_data` task.
+PDF_INVOICE_FILE_PATH = os.path.join("devdata", "text-invoice.pdf")
+# Sample invoice searched by `print_values_from_row` and `print_values_from_column`.
+PDF_INVOICE_TABLE_FILE_PATH = os.path.join("devdata", "text-invoice-table.pdf")
+
+# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
+EXCEL_FILE_STARTING_ROW_INDEX = 3
+
+
+def print_all_text(pdf_lib: PyPDFLibrary) -> None:
+    text_from_all_pages = pdf_lib.parse_text()
+    log.info(text_from_all_pages)
+
+
+def find_service_description(pdf_lib: PyPDFLibrary) -> None:
+    pattern = r".*Service.*\n.{4}(.*)\n(.+?\.\.\.)"
+    matches = pdf_lib.find_matches(pattern)
+    assert len(matches) > 0, f"Text could not be found for: {pattern}"
+    text = "\n".join(matches)
+    log.info(f"First item under service: {text}")
+
+
+def find_web_design_service_value(pdf_lib: PyPDFLibrary) -> None:
+    pattern = r".*Service.*\n.*Web Design.*\n.*?\$(.{5}) (.{5}) (.{6})"
+    matches = pdf_lib.find_matches(pattern)
+    assert len(matches) >= 3, f"Unexpected match found: {matches}"
+    log.info(
+        f'The "Adjust" column value to the right of the "Web Design" "Service": {matches[1]}'
+    )
+
+
+def find_prices(pdf_lib: PyPDFLibrary) -> None:
+    pattern = r"Sub Total (.*)\nTax (.*)\nTotal (.*)"
+    matches = pdf_lib.find_matches(pattern)
+    assert len(matches) > 0, f"No matches found for pattern: {pattern}"
+    log.info(f'Next 3 prices below "Sub Total": {matches}')
+
+
+def find_lines_with_email_addresses(pdf_lib: PyPDFLibrary) -> None:
+    pattern = r"^.*\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b.*$"
+    matches = pdf_lib.find_matches(pattern)
+    assert len(matches) > 0, f"No matches found for pattern: {pattern}"
+    log.info(f"Show all the lines containing an e-mail address: {matches}")
+
+
+def print_values_from_row() -> None:
+    elements = find_row(PDF_INVOICE_TABLE_FILE_PATH, 'Test Item')
+    log.info(f"Elements in searched row: {elements}")
+
+
+def print_values_from_column() -> None:
+    column = find_column(PDF_INVOICE_TABLE_FILE_PATH, 'Total gross')
+    log.info(f"Column: {column}")
+
+
+@task
+def extract_text_data() -> None:
+    pdf_lib = PyPDFLibrary()
+
+    try:
+        pdf_lib.open_pdf(PDF_INVOICE_FILE_PATH)
+        print_all_text(pdf_lib)
+        find_service_description(pdf_lib)
+        find_web_design_service_value(pdf_lib)
+        find_prices(pdf_lib)
+        find_lines_with_email_addresses(pdf_lib)
+    finally:
+        pdf_lib.close()
+
+
+@task
+def extract_elements_from_table() -> None:
+    print_values_from_row()
+    print_values_from_column()
+
+