Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pypdfium2 rendering backend #384

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
from .pdfium_backend import PdfiumBackend
from .ghostscript_backend import GhostscriptBackend
from .poppler_backend import PopplerBackend


BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
BACKENDS = {
"pdfium": PdfiumBackend,
"poppler": PopplerBackend,
"ghostscript": GhostscriptBackend,
mara004 marked this conversation as resolved.
Show resolved Hide resolved
}


class ImageConversionBackend:
def __init__(self, backend="poppler", use_fallback=True):
def __init__(self, backend="pdfium", use_fallback=True):
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")

self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
self.fallbacks = list(BACKENDS.keys())
self.fallbacks.remove(self.backend)

def convert(self, pdf_path, png_path):
try:
Expand Down
18 changes: 18 additions & 0 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# pypdfium2 is imported on a best-effort basis so that this module can always
# be imported.  When the import fails, ``pdfium`` stays ``None`` and the
# original exception is kept in ``pdfium_exc`` so it can be reported later.
pdfium = None
pdfium_exc = None
try:
    import pypdfium2 as pdfium
except Exception as exc:
    pdfium_exc = exc


class PdfiumBackend:
    """Image conversion backend that renders a PDF page through pypdfium2."""

    def convert(self, pdf_path, png_path, resolution=300):
        """Render a single-page PDF and save the result as an image.

        Parameters
        ----------
        pdf_path : str
            Path of the PDF to render. It must contain exactly one page.
        png_path : str
            Path the rendered image is written to.
        resolution : int, optional (default: 300)
            Value divided by 72 to obtain the render scale factor.

        Raises
        ------
        OSError
            If pypdfium2 could not be imported.
        ValueError
            If the document does not contain exactly one page.
        """
        if not pdfium:
            raise OSError(f"pypdfium2 is not available: {pdfium_exc!r}")

        doc = pdfium.PdfDocument(pdf_path)
        try:
            # Explicit check instead of ``assert``: assertions are stripped
            # when Python runs with -O, which would silently render only the
            # first page of a multi-page document.
            page_count = len(doc)
            if page_count != 1:
                raise ValueError(
                    f"Expected a single-page PDF, got {page_count} pages: "
                    f"{pdf_path!r}"
                )
            doc.init_forms()
            image = doc[0].render(scale=resolution / 72).to_pil()
            # Save before the document is closed so the image data is
            # written while every pdfium object is still alive.
            image.save(png_path)
        finally:
            # Release the native document handle even when rendering fails.
            doc.close()
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def mypy(session: Session) -> None:
session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py")


base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17"]
base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pypdfium2>=4,<5"]

plot_requires = [
"matplotlib>=2.2.3",
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ myst-parser = {version = ">=0.16.1"}
camelot = "camelot.__main__:main"

[tool.poetry.group.base.dependencies]
ghostscript = "^0.7"
pypdfium2 = "^4"
ghostscript = "^0.7" # remove in favor of pypdfium2?
opencv-python = "^4.7.0.68"
mara004 marked this conversation as resolved.
Show resolved Hide resolved


Expand Down
37 changes: 37 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ def test_password(testdir):
assert_frame_equal(df, tables[0].df)


@skip_on_windows
def test_repr_pdfium(testdir):
    """Check the reprs produced for the local ``foo.pdf`` with pdfium."""
    pdf_file = os.path.join(testdir, "foo.pdf")
    result = camelot.read_pdf(pdf_file, backend="pdfium")

    assert repr(result) == "<TableList n=1>"
    first_table = result[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_repr_poppler(testdir):
filename = os.path.join(testdir, "foo.pdf")
Expand All @@ -68,6 +77,15 @@ def test_repr_ghostscript(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_on_windows
def test_url_pdfium():
    """Check the reprs produced when ``foo.pdf`` is read from a URL with pdfium."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    result = camelot.read_pdf(pdf_url, backend="pdfium")

    assert repr(result) == "<TableList n=1>"
    first_table = result[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_url_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
Expand All @@ -86,6 +104,25 @@ def test_url_ghostscript(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


@skip_on_windows
def test_pages_pdfium():
    """Check that every ``pages`` spelling yields the same reprs with pdfium."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"

    # No ``pages`` argument first, then the two spellings covering all pages.
    for page_kwargs in ({}, {"pages": "1-end"}, {"pages": "all"}):
        result = camelot.read_pdf(pdf_url, **page_kwargs, backend="pdfium")
        assert repr(result) == "<TableList n=1>"
        first_table = result[0]
        assert repr(first_table) == "<Table shape=(7, 7)>"
        top_left_cell = first_table.cells[0][0]
        assert repr(top_left_cell) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_pages_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
Expand Down
8 changes: 5 additions & 3 deletions tests/test_image_conversion_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from camelot.backends import ImageConversionBackend

# TODO: consider adding test coverage for the pdfium backend


@pytest.fixture
def patch_backends(monkeypatch):
Expand Down Expand Up @@ -31,15 +33,15 @@ def convert(self, pdf_path, png_path):


def test_poppler_backend_error_when_no_use_fallback(patch_backends):
backend = ImageConversionBackend(use_fallback=False)
backend = ImageConversionBackend(backend="poppler", use_fallback=False)

message = "Image conversion failed with image conversion backend 'poppler'"
with pytest.raises(ValueError, match=message):
backend.convert("foo", "bar")


def test_ghostscript_backend_when_use_fallback(patch_backends):
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="ghostscript")
backend.convert("foo", "bar")


Expand All @@ -49,7 +51,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", backends, raising=True
)
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="poppler")

message = "Image conversion failed with image conversion backend 'ghostscript'"
with pytest.raises(ValueError, match=message):
Expand Down