Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD: output monochrome image support #174

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ convert_from_path(
grayscale=False,
size=None,
paths_only=False,
mono=False,
)

convert_from_bytes(
Expand All @@ -47,6 +48,7 @@ convert_from_bytes(
grayscale=False,
size=None,
paths_only=False,
mono=False,
)
```

Expand Down
16 changes: 16 additions & 0 deletions pdf2image/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@
from PIL import Image


def parse_buffer_to_pbm(data):
    """Parse concatenated raw PBM (P4) file bytes to Pillow Images.

    ``data`` is expected to hold one or more PBM files back to back, each
    laid out as ``<magic>\\n<width> <height>\\n<raster>``. Returns a list with
    one Pillow Image per file found in the buffer.
    """

    images = []

    index = 0

    while index < len(data):
        # A PBM header has only two lines (magic number and dimensions);
        # unlike PGM/PPM there is no max-value line.
        code, size = tuple(data[index : index + 40].split(b"\n")[0:2])
        size_x, size_y = tuple(size.split(b" "))
        # Raw PBM packs 8 pixels per byte and pads every row to a whole byte,
        # so the raster is ceil(width / 8) * height bytes -- not 3 bytes per
        # pixel as in PPM. Using the PPM size made the first image swallow
        # every following page of the buffer.
        row_bytes = (int(size_x) + 7) // 8
        file_size = len(code) + len(size) + 2 + row_bytes * int(size_y)
        images.append(Image.open(BytesIO(data[index : index + file_size])))
        index += file_size

    return images

def parse_buffer_to_ppm(data):
"""Parse PPM file bytes to Pillow Image"""

Expand Down
17 changes: 14 additions & 3 deletions pdf2image/pdf2image.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .generators import uuid_generator, counter_generator, ThreadSafeGenerator

from .parsers import (
parse_buffer_to_pbm,
parse_buffer_to_pgm,
parse_buffer_to_ppm,
parse_buffer_to_jpeg,
Expand Down Expand Up @@ -55,6 +56,7 @@ def convert_from_path(
paths_only=False,
use_pdftocairo=False,
timeout=None,
mono=False,
):
"""
Description: Convert PDF to Image will throw whenever one of the condition is reached
Expand Down Expand Up @@ -98,7 +100,7 @@ def convert_from_path(

# We start by getting the output format, the buffer processing function and if we need pdftocairo
parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
fmt, grayscale
fmt, grayscale, mono
)

# We use pdftocairo if the format requires it OR we need a transparent output
Expand Down Expand Up @@ -170,6 +172,7 @@ def convert_from_path(
single_file,
grayscale,
size,
mono
)

if use_pdfcairo:
Expand Down Expand Up @@ -236,6 +239,7 @@ def convert_from_bytes(
paths_only=False,
use_pdftocairo=False,
timeout=None,
mono=False,
):
"""
Description: Convert PDF to Image will throw whenever one of the condition is reached
Expand Down Expand Up @@ -288,6 +292,7 @@ def convert_from_bytes(
paths_only=paths_only,
use_pdftocairo=use_pdftocairo,
timeout=timeout,
mono=mono
)
finally:
os.close(fh)
Expand All @@ -308,6 +313,7 @@ def _build_command(
single_file,
grayscale,
size,
mono
):
if use_cropbox:
args.append("-cropbox")
Expand All @@ -321,7 +327,7 @@ def _build_command(
if last_page is not None:
args.extend(["-l", str(last_page)])

if fmt not in ["pgm", "ppm"]:
if fmt not in ["pgm", "ppm", "pbm"]:
args.append("-" + fmt)

if fmt in ["jpeg", "jpg"] and jpegopt:
Expand All @@ -339,6 +345,9 @@ def _build_command(
if grayscale:
args.append("-gray")

if mono:
args.append("-mono")

if size is None:
pass
elif isinstance(size, tuple) and len(size) == 2:
Expand All @@ -360,7 +369,7 @@ def _build_command(
return args


def _parse_format(fmt, grayscale=False):
def _parse_format(fmt, grayscale=False, mono=False):
fmt = fmt.lower()
if fmt[0] == ".":
fmt = fmt[1:]
Expand All @@ -372,6 +381,8 @@ def _parse_format(fmt, grayscale=False):
return "tiff", "tif", None, True
if fmt == "ppm" and grayscale:
return "pgm", "pgm", parse_buffer_to_pgm, False
if fmt == "ppm" and mono:
return "pbm", "pbm", parse_buffer_to_pbm, False
# Unable to parse the format so we'll use the default
return "ppm", "ppm", parse_buffer_to_ppm, False

Expand Down
48 changes: 47 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,52 @@ def test_conversion_to_grayscale_from_path_using_dir(self):
)
)

## Test mono option

@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_conversion_to_mono_from_bytes(self):
    """convert_from_bytes(..., mono=True) must yield 1-bit ("1" mode) images."""
    start_time = time.time()
    with open("./tests/test_14.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(pdf_file.read(), mono=True)
        # assertEqual reports the actual mode on failure, which
        # assertTrue(a == b) does not.
        self.assertEqual(images_from_bytes[0].mode, "1")
    # Elapsed time is divided by 14.0 -- presumably the page count of
    # test_14.pdf, giving a per-page figure (confirm against the fixture).
    print(
        "test_conversion_to_mono_from_bytes_14: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )

@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_conversion_to_mono_from_path(self):
    """convert_from_path(..., mono=True) must yield 1-bit ("1" mode) images."""
    start_time = time.time()
    images_from_path = convert_from_path("./tests/test_14.pdf", mono=True)
    # assertEqual reports the actual mode on failure, which
    # assertTrue(a == b) does not.
    self.assertEqual(images_from_path[0].mode, "1")
    # Plain loop: the close() calls are side effects, not a list to build.
    for im in images_from_path:
        im.close()
    # Elapsed time is divided by 14.0 -- presumably the page count of
    # test_14.pdf, giving a per-page figure (confirm against the fixture).
    print(
        "test_conversion_to_mono_from_path_14: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )

@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_conversion_to_mono_from_path_using_dir(self):
    """mono=True combined with output_folder must yield 1-bit ("1" mode) images."""
    start_time = time.time()
    with TemporaryDirectory() as path:
        images_from_path = convert_from_path(
            "./tests/test_14.pdf", output_folder=path, mono=True
        )
        # assertEqual reports the actual mode on failure, which
        # assertTrue(a == b) does not.
        self.assertEqual(images_from_path[0].mode, "1")
        # Close the images while the temporary directory still exists.
        # Plain loop: the close() calls are side effects, not a list to build.
        for im in images_from_path:
            im.close()
    # Elapsed time is divided by 14.0 -- presumably the page count of
    # test_14.pdf, giving a per-page figure (confirm against the fixture).
    print(
        "test_conversion_to_mono_from_path_using_dir_14: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )

## Test pathlib support

@profile
Expand Down Expand Up @@ -1642,7 +1688,7 @@ def test_pdfinfo_functions_same_number_of_parameters(self):
len(signature(pdfinfo_from_bytes).parameters),
)
print("test_pdfinfo_functions_same_number_of_parameters: {} sec".format(time.time() - start_time))

@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_timeout_pdfinfo_from_path_241(self):
start_time = time.time()
Expand Down