Skip to content

Commit

Permalink
Call PDFBox functions using jpype (#10).
Browse files Browse the repository at this point in the history
  • Loading branch information
lebedov committed Jul 29, 2019
1 parent 2afe04b commit 52ab4fb
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 63 deletions.
71 changes: 25 additions & 46 deletions pdfbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
import urllib.request

import appdirs
import jpype
import jpype.imports
import pkg_resources
import sarge

pdfbox_archive_url = 'https://archive.apache.org/dist/pdfbox/'
import os
Expand Down Expand Up @@ -47,9 +48,9 @@ class PDFBox(object):
pdf_to_images(input_path, password=None,
imageType=None, outputPrefix=None,
startPage=None, endPage=None,
page=None, dpi=None, color=None, cropbox=None,time=True)
page=None, dpi=None, color=None, cropbox=None, time=True)
Extract all pages of PDF file as images.
extract_images(self, input_path, password=None, prefix=None,
extract_images(input_path, password=None, prefix=None,
directJPEG=False)
Extract all images from a PDF file.
"""
Expand Down Expand Up @@ -129,13 +130,14 @@ def f(s):

def __init__(self):
self.pdfbox_path = self._get_pdfbox_path()
self.java_path = shutil.which('java')
if not self.java_path:
raise RuntimeError('java not found')
jpype.addClassPath(self.pdfbox_path)
jpype.startJVM(convertStrings=False)
import org.apache.pdfbox.tools as tools
self.pdfbox_tools = tools

def extract_text(self, input_path, output_path='',
password=None, encoding=None, html=False, sort=False,
ignore_beads=False, start_page=1, end_page=None):
ignore_beads=False, start_page=1, end_page=None, console=False):
"""
Extract all text from PDF file.
Expand All @@ -144,7 +146,8 @@ def extract_text(self, input_path, output_path='',
input_path : str
Input PDF file.
output_path : str
Output text file. If not specified, the extracted text is returned.
Output text file. If not specified, the extracted text is written to
a text file with the same basename as the input file.
password : str
PDF password.
encoding : str
Expand All @@ -159,11 +162,8 @@ def extract_text(self, input_path, output_path='',
First page to extract (starting with 1).
end_page : int
Last page to extract (starting with 1).
Returns
-------
text : str
Extracted text. If `output_path` is specified, nothing is returned.
console : bool
If True, write output to console.
"""

options = (' -password {password}'.format(password=password) if password else '') +\
Expand All @@ -172,17 +172,12 @@ def extract_text(self, input_path, output_path='',
(' -sort' if sort else '') +\
(' -ignoreBeads' if ignore_beads else '') +\
(' -startPage {start_page}'.format(start_page=start_page) if start_page else '') +\
(' -endPage {end_page}'.format(end_page=end_page) if end_page else '')
if not output_path:
options += ' -console'
cmd = '{java_path} -jar {pdfbox_path} ExtractText {options} {input_path} {output_path}'.format(java_path=self.java_path,
pdfbox_path=self.pdfbox_path,
options=options,
input_path=input_path,
output_path=output_path)
p = sarge.capture_stdout(cmd)
if not output_path:
return p.stdout.text
(' -endPage {end_page}'.format(end_page=end_page) if end_page else '') +\
'{console}'.format(console='-console') if console else ''
cmd = '{options} {input_path} {output_path}'.format(options=options,
input_path=str(pathlib.Path(input_path).expanduser()),
output_path=output_path).strip()
self.pdfbox_tools.ExtractText.main(cmd.split(' '))

def pdf_to_images(self, input_path, password=None,
imageType=None, outputPrefix=None,
Expand Down Expand Up @@ -224,11 +219,6 @@ def pdf_to_images(self, input_path, password=None,
The page area to export, e.g "34 45 56 67"
time : bool
Prints timing information to stdout.
Returns
-------
text : str
Time taken to complete the process.
"""

options = (' -password {password}'.format(password=password) if password else '') + \
Expand All @@ -242,12 +232,9 @@ def pdf_to_images(self, input_path, password=None,
(' -cropbox {cropbox}'.format(cropbox=cropbox) if cropbox else '') + \
(' {time}'.format(time="-time") if time else '')

cmd = '{java_path} -jar {pdfbox_path} PDFToImage {options} {input_path}'.format(java_path=self.java_path,
pdfbox_path=self.pdfbox_path,
options=options,
input_path=input_path)
p = sarge.capture_both(cmd)
return p.stderr.text
cmd = '{options} {input_path}'.format(options=options,
input_path=input_path).strip()
self.pdfbox_tools.PDFToImage.main(cmd.split(' '))

def extract_images(self, input_path, password=None, prefix=None, directJPEG=False):
"""
Expand All @@ -263,20 +250,12 @@ def extract_images(self, input_path, password=None, prefix=None, directJPEG=Fals
The prefix to the image file (default: name of PDF document).
directJPEG: bool
Forces the direct extraction of JPEG images regardless of colorspace (default: False).
Returns
-------
text : str
Time taken to complete the process.
"""

options = (' -password {password}'.format(password=password) if password else '') + \
(' -prefix {prefix}'.format(prefix=prefix) if prefix else '') + \
(' -directJPEG {directJPEG}'.format(directJPEG="-directJPEG") if directJPEG else '')

cmd = '{java_path} -jar {pdfbox_path} ExtractImages {options} {input_path}'.format(java_path=self.java_path,
pdfbox_path=self.pdfbox_path,
options=options,
input_path=input_path)
p = sarge.capture_both(cmd)
return p.stderr.text
cmd = '{options} {input_path}'.format(options=options,
input_path=input_path).strip()
self.pdfbox_tools.ExtractImages.main(cmd.split(' '))
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from setuptools import setup

NAME = 'python-pdfbox'
VERSION = '0.1.6'
VERSION = '0.1.7'
AUTHOR = 'Lev E. Givon'
AUTHOR_EMAIL = '[email protected]'
URL = 'https://github.com/lebedov/python-pdfbox/'
Expand Down Expand Up @@ -43,4 +43,4 @@
url = URL,
packages = find_packages(),
python_requires='>=3',
install_requires = ['appdirs', 'sarge', 'setuptools'])
install_requires = ['appdirs', 'jpype1', 'setuptools'])
28 changes: 13 additions & 15 deletions tests/test_pdfbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,33 @@
from pathlib import Path
import pdfbox
import os
from sys import platform
from tempfile import TemporaryDirectory

# To generate test PDF, process test.md with pandoc using the command
# pandoc -t latex test.md -o test.pdf
from tempfile import TemporaryDirectory

class test_pdfbox(TestCase):
@classmethod
def setUpClass(cls):
cls.p = pdfbox.PDFBox()

def test_extract_text(self):
p = pdfbox.PDFBox()
text = p.extract_text('./test.pdf')
if platform == "linux" or platform == "linux2" or platform == "darwin":
self.assertEqual(text, 'this is a test PDF\n')
elif platform == "win32":
self.assertEqual(text, 'this is a test PDF\r\n')
with TemporaryDirectory() as output_dir:
output_path = (Path(output_dir) / 'test.txt').resolve()
self.p.extract_text('./test.pdf', output_path)
self.assertTrue('test.txt' in os.listdir(output_dir))

def test_pdf_to_images(self):
p = pdfbox.PDFBox()

with TemporaryDirectory() as output_dir:
output_prefix = (Path(output_dir) / 'test').resolve()
result = p.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
self.p.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
self.assertTrue('test1.jpg' in os.listdir(output_dir) and 'test2.jpg' in os.listdir(output_dir))

def test_extract_images(self):
p = pdfbox.PDFBox()

with TemporaryDirectory() as output_dir:
output_prefix = (Path(output_dir) / 'test').resolve()
result = p.extract_images('./test3.pdf', prefix=output_prefix)
self.p.extract_images('./test3.pdf', prefix=output_prefix)
self.assertTrue('test-1.png' in os.listdir(output_dir))


if __name__ == '__main__':
main()

0 comments on commit 52ab4fb

Please sign in to comment.