Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert to python #1

Merged
merged 6 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 0 additions & 34 deletions ExtendedPDF.py

This file was deleted.

47 changes: 47 additions & 0 deletions PDFMinerLibrary.py
mmokko marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

from typing import List

def find_row(pdf_path: str, search_text: str, tolerance: float = 0.0) -> List[str]:
    """
    Find all elements from the same row by matching the coordinates.

    Pages are scanned in order. The first horizontal text box whose text
    contains ``search_text`` anchors the row; every other horizontal text box
    on that page whose bottom and top y-coordinates match the anchor's and
    whose left edge lies to the right of the anchor's left edge belongs to
    the row.

    :param pdf_path: path of the PDF file handed to ``extract_pages``.
    :param search_text: substring identifying the anchor text box.
    :param tolerance: largest allowed difference between the y-coordinates of
        the anchor and a candidate. The default ``0.0`` requires exact
        equality.
    :return: stripped texts of the anchor and the matching boxes, in the order
        the page layout yields them; an empty list when no text box contains
        ``search_text``.
    """
    for page_layout in extract_pages(pdf_path):
        text_boxes = [
            element
            for element in page_layout
            if isinstance(element, LTTextBoxHorizontal)
        ]
        anchor = next(
            (box for box in text_boxes if search_text in box.get_text()), None
        )
        if anchor is None:
            # Nothing on this page; keep looking on the following pages.
            continue

        x0, y0, _, y1 = anchor.bbox
        row_elements = []
        for element in text_boxes:
            # The anchor itself is always part of the row.
            if element is anchor:
                row_elements.append(element.get_text().strip())
                continue
            ex0, ey0, _, ey1 = element.bbox
            # Same y-extent as the anchor and located after it on the x-axis.
            same_row = abs(ey0 - y0) <= tolerance and abs(ey1 - y1) <= tolerance
            if same_row and ex0 > x0:
                row_elements.append(element.get_text().strip())
        return row_elements

    # No page contained the search text: honour the List[str] return contract.
    return []
mmokko marked this conversation as resolved.
Show resolved Hide resolved

def find_column(pdf_path: str, search_text: str):
    """
    Find all elements from the same column by matching the coordinates.

    Every horizontal text box containing ``search_text`` (on any page) acts as
    a column header. For each header, the header text is collected together
    with the text of every other horizontal text box on the same page that
    starts at or right of the header's left edge, ends no further right than
    the header's right edge plus one unit, and has a smaller bottom
    y-coordinate than the header.

    Returns the stripped texts as one flat list covering all pages and all
    headers; the list is empty when nothing matches.
    """
    collected = []
    for page_layout in extract_pages(pdf_path):
        text_boxes = [
            item for item in page_layout if isinstance(item, LTTextBoxHorizontal)
        ]
        for header in text_boxes:
            if search_text not in header.get_text():
                continue
            left, bottom, right, _ = header.bbox
            for box in text_boxes:
                # The header itself is always part of the column.
                if box == header:
                    collected.append(box.get_text().strip())
                    continue
                box_left, box_bottom, box_right, _ = box.bbox
                # Horizontally inside the header's span (give or take) and
                # positioned at a smaller y-coordinate than the header.
                if (
                    box_left >= left
                    and box_right <= (right + 1)
                    and bottom > box_bottom
                ):
                    collected.append(box.get_text().strip())
    return collected


54 changes: 54 additions & 0 deletions PyPDFLibrary.py
mmokko marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import functools
import io
import re
from typing import Any, Callable, Dict, List, TypeVar

from pypdf import PdfReader

T = TypeVar("T", bound=Callable[..., Any])


class PyPDFLibrary:
    """Small helper around ``pypdf.PdfReader`` for text extraction and regex search.

    Call :meth:`open_pdf` before any extraction method and :meth:`close` when
    done. The instance can also be used as a context manager, which closes
    the file on exit.
    """

    def __init__(self):
        # Binary file handle backing the reader; kept so close() can release it.
        self._fh: io.BufferedReader | None = None
        # Reader for the currently open PDF, or None when nothing is open.
        self._reader: PdfReader | None = None

    def _validate_reader(func: T) -> T:
        """Decorate a method so it raises ``ValueError`` when no PDF is open."""

        @functools.wraps(func)
        def wrapper(self: "PyPDFLibrary", *args: Any, **kwargs: Any):
            if self._reader is None:
                raise ValueError("Open PDF file first")
            return func(self, *args, **kwargs)

        return wrapper

    def __enter__(self) -> "PyPDFLibrary":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def open_pdf(self, file_path: str) -> None:
        """Open ``file_path`` for reading, closing any previously opened PDF.

        :raises OSError: when the file cannot be opened.
        """
        # Release a previously opened file so re-opening does not leak a handle.
        self.close()
        fh = open(file_path, "rb")
        try:
            reader = PdfReader(fh)
        except Exception:
            # Do not leave the handle dangling when the reader rejects the file.
            fh.close()
            raise
        self._fh = fh
        self._reader = reader

    @_validate_reader
    def parse_text(self) -> Dict[int, str]:
        """Return the extracted text of every page keyed by zero-based page index.

        :raises ValueError: when no PDF is open.
        """
        return {
            page_index: page.extract_text()
            for page_index, page in enumerate(self._reader.pages)
        }

    @staticmethod
    def flatten(nested: List[Any]) -> List[str]:
        """Flatten one level of tuples: ``[("a", "b"), "c"]`` -> ``["a", "b", "c"]``."""
        return [
            element
            for sublist in nested
            for element in (sublist if isinstance(sublist, tuple) else (sublist,))
        ]

    @_validate_reader
    def find_matches(self, pattern: "str | re.Pattern[str]") -> List[str]:
        """Collect all ``findall`` results for ``pattern`` across every page.

        :param pattern: a pattern string, compiled here with ``re.MULTILINE``,
            or an already compiled pattern, used with its own flags.
        :return: matches in page order; group tuples produced by patterns
            with several capture groups are flattened into the list.
        :raises ValueError: when no PDF is open.
        """
        # Flags cannot be combined with a compiled pattern, so only plain
        # strings are compiled here. Compiling once also hoists the work out
        # of the page loop.
        compiled = (
            re.compile(pattern, re.MULTILINE) if isinstance(pattern, str) else pattern
        )
        matches: List[str] = []
        for page in self._reader.pages:
            matches.extend(self.flatten(compiled.findall(page.extract_text())))
        return matches

    def close(self) -> None:
        """Close the underlying file, if any, and forget the reader."""
        if self._fh is not None:
            self._fh.close()
        # Drop both references so later calls fail the "open first" check
        # instead of reading from a closed file.
        self._fh = None
        self._reader = None
27 changes: 4 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,13 @@ information out of PDF files.

## Tasks

### `Extract Text Data With RPA`
### `Extract Text Data`

Extract textual data with the local help of `RPA.PDF` library.

Watch demo: https://www.loom.com/share/4bc28045f45941d18d935a60e35e227a
Extract textual data from PDF file.
mmokko marked this conversation as resolved.
Show resolved Hide resolved

> Usually this is sufficient for most of the cases.


### `Extract Tabular Data With Camelot`

Extract tables with the Camelot library. (see external
[dependency](https://pypi.org/project/camelot-py/))

Watch demo: https://www.loom.com/share/87d434dad8d748ada566a8c199cbea90

> This is useful for getting out nicely formatted tabular data, but comes at the cost
> of heavier dependencies brought in the built environment.

### `Extract Structured Data With AI`

Extract fields detected in both text or image-based PDFs using 3rd-party external
services wrapped by the `RPA.DocumentAI` library.

Watch demo: https://www.loom.com/share/e9753b884b6f4aa1ac0271580a0cf682
### `Extract Elements`
mmokko marked this conversation as resolved.
Show resolved Hide resolved

> When all the options above fail (or provide inaccurate data), it is time to employ a
> Machine Learning model specially trained to detect and structure fields of interest
> from the provided input file, be it text-based or even image.
In some cases it may be easier to find an element and its neighbours instead of just parsing the text. In this example we find rows and columns from a table in a PDF document.
mmokko marked this conversation as resolved.
Show resolved Hide resolved
10 changes: 5 additions & 5 deletions conda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ channels:
dependencies:
# Define conda-forge packages here -> https://anaconda.org/search
# When available, prefer the conda-forge packages over pip as installations are more efficient.
- python=3.9.13 # https://pyreadiness.org/3.9/
- camelot-py=0.10.1
- pip=22.1.2 # https://pip.pypa.io/en/stable/news/
- python=3.10.13 # https://pyreadiness.org/3.10/
- pip=23.3.2 # https://pip.pypa.io/en/stable/news/
mmokko marked this conversation as resolved.
Show resolved Hide resolved
- pip:
# Define pip packages here -> https://pypi.org/
- rpaframework==21.0.0 # https://rpaframework.org/releasenotes.html
- camelot-py[base]==0.10.1
- robocorp==1.6.1
- pypdf==4.0.1
- pdfminer.six==20231228
11 changes: 4 additions & 7 deletions robot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
# https://github.com/robocorp/rcc/blob/master/docs/recipes.md#what-is-in-robotyaml

tasks:
# Task names here are used when executing the bots, so renaming these is recommended.
Extract Text Data With RPA:
robotTaskName: Extract Text Data With RPA
Extract Tabular Data With Camelot:
robotTaskName: Extract Tabular Data With Camelot
Extract Structured Data With AI:
robotTaskName: Extract Structured Data With AI
Extract Text Data:
shell: python -m robocorp.tasks run -t extract_text_data
Extract Elements:
shell: python -m robocorp.tasks run -t extract_elements_from_table

condaConfigFile: conda.yaml

Expand Down
81 changes: 81 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os

from robocorp import log
from robocorp.tasks import task

from PyPDFLibrary import PyPDFLibrary
from PDFMinerLibrary import find_row, find_column

# Sample PDF opened by extract_text_data for text parsing and regex searches.
PDF_INVOICE_FILE_PATH = os.path.join("devdata", "text-invoice.pdf")
# Sample PDF searched by print_values_from_row and print_values_from_column.
PDF_INVOICE_TABLE_FILE_PATH = os.path.join("devdata", "text-invoice-table.pdf")
mmokko marked this conversation as resolved.
Show resolved Hide resolved

# NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
EXCEL_FILE_STARTING_ROW_INDEX = 3


def print_all_text(pdf_lib: PyPDFLibrary) -> None:
    """Log the mapping that ``pdf_lib.parse_text()`` returns for the open PDF."""
    log.info(pdf_lib.parse_text())


def find_service_description(pdf_lib: PyPDFLibrary) -> None:
    """Log the text captured on the two lines following a "Service" line.

    The pattern looks for a line containing "Service", captures the next line
    after its first four characters, and captures the line after that up to
    and including its first "...".

    :raises AssertionError: when the pattern matches nothing.
    """
    pattern = r".*Service.*\n.{4}(.*)\n(.+?\.\.\.)"
    matches = pdf_lib.find_matches(pattern)
    # Raise explicitly: a bare ``assert`` is removed when Python runs with -O.
    if not matches:
        raise AssertionError(f"Text could not be found for: {pattern}")
    text = "\n".join(matches)
    log.info(f"First item under service: {text}")


def find_web_design_service_value(pdf_lib: PyPDFLibrary) -> None:
    """Log the second of three values found on the line after "Web Design".

    The pattern expects a line containing "Service", then a line containing
    "Web Design", and on the following line captures three space-separated
    fields (5, 5 and 6 characters wide) after the first "$".

    :raises AssertionError: when fewer than three values were captured.
    """
    pattern = r".*Service.*\n.*Web Design.*\n.*?\$(.{5}) (.{5}) (.{6})"
    matches = pdf_lib.find_matches(pattern)
    # Raise explicitly: with a bare ``assert`` under -O, a short result would
    # surface as an IndexError on matches[1] below.
    if len(matches) < 3:
        raise AssertionError(f"Unexpected match found: {matches}")
    log.info(
        f'The "Adjust" column value to the right of the "Web Design" "Service": {matches[1]}'
    )


def find_prices(pdf_lib: PyPDFLibrary) -> None:
    """Log the values following "Sub Total", "Tax" and "Total" on consecutive lines.

    :raises AssertionError: when the pattern matches nothing.
    """
    pattern = r"Sub Total (.*)\nTax (.*)\nTotal (.*)"
    matches = pdf_lib.find_matches(pattern)
    # Raise explicitly: a bare ``assert`` is removed when Python runs with -O.
    if not matches:
        raise AssertionError(f"No matches found for pattern: {pattern}")
    log.info(f'Next 3 prices below "Sub Total": {matches}')


def find_lines_with_email_addresses(pdf_lib: PyPDFLibrary) -> None:
    """Log every line of the PDF text that contains an e-mail address.

    The pattern has no capture groups, so each match is the whole line.

    :raises AssertionError: when no line contains an e-mail address.
    """
    # The top-level-domain class is [A-Za-z]; the earlier [A-Z|a-z] also
    # accepted a literal "|" because "|" is not alternation inside brackets.
    pattern = r"^.*\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b.*$"
    matches = pdf_lib.find_matches(pattern)
    # Raise explicitly: a bare ``assert`` is removed when Python runs with -O.
    if not matches:
        raise AssertionError(f"No matches found for pattern: {pattern}")
    log.info(f"Show all the lines containing an e-mail address: {matches}")


def print_values_from_row() -> None:
    """Log what ``find_row`` returns for "Test Item" in the table invoice PDF."""
    row = find_row(PDF_INVOICE_TABLE_FILE_PATH, "Test Item")
    log.info(f"Elements in searched row: {row}")


def print_values_from_column() -> None:
    """Log what ``find_column`` returns for "Total gross" in the table invoice PDF."""
    values = find_column(PDF_INVOICE_TABLE_FILE_PATH, "Total gross")
    log.info(f"Column: {values}")


@task
def extract_text_data() -> None:
    """Open the sample invoice PDF and run every text-extraction step on it.

    The PDF library is closed afterwards, whether or not a step raised.
    """
    steps = (
        print_all_text,
        find_service_description,
        find_web_design_service_value,
        find_prices,
        find_lines_with_email_addresses,
    )
    pdf_lib = PyPDFLibrary()

    try:
        pdf_lib.open_pdf(PDF_INVOICE_FILE_PATH)
        for step in steps:
            step(pdf_lib)
    finally:
        pdf_lib.close()


@task
def extract_elements_from_table() -> None:
    """Run the row lookup and then the column lookup on the table invoice PDF."""
    for show in (print_values_from_row, print_values_from_column):
        show()


Loading