Call PDFBox functions using jpype (#10).

lebedov · Jul 29, 2019 · 52ab4fb · 52ab4fb
1 parent 2afe04b
commit 52ab4fb
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 63 deletions.
diff --git a/pdfbox/__init__.py b/pdfbox/__init__.py
@@ -12,8 +12,9 @@
 import urllib.request
 
 import appdirs
+import jpype
+import jpype.imports
 import pkg_resources
-import sarge
 
 pdfbox_archive_url = 'https://archive.apache.org/dist/pdfbox/'
 import os
@@ -47,9 +48,9 @@ class PDFBox(object):
     pdf_to_images(input_path, password=None,
                   imageType=None, outputPrefix=None,
                   startPage=None, endPage=None,
-                  page=None, dpi=None, color=None, cropbox=None,time=True)
+                  page=None, dpi=None, color=None, cropbox=None, time=True)
         Extract all pages of PDF file as images.
-    extract_images(self, input_path, password=None, prefix=None,
+    extract_images(input_path, password=None, prefix=None,
                    directJPEG=False)
         Extract all images from a PDF file.
     """
@@ -129,13 +130,14 @@ def f(s):
 
     def __init__(self):
         self.pdfbox_path = self._get_pdfbox_path()
-        self.java_path = shutil.which('java')
-        if not self.java_path:
-            raise RuntimeError('java not found')
+        jpype.addClassPath(self.pdfbox_path)
+        jpype.startJVM(convertStrings=False)
+        import org.apache.pdfbox.tools as tools
+        self.pdfbox_tools = tools
 
     def extract_text(self, input_path, output_path='',
                      password=None, encoding=None, html=False, sort=False,
-                     ignore_beads=False, start_page=1, end_page=None):
+                     ignore_beads=False, start_page=1, end_page=None, console=False):
         """
         Extract all text from PDF file.
 
@@ -144,7 +146,8 @@ def extract_text(self, input_path, output_path='',
         input_path : str
             Input PDF file.
         output_path : str
-            Output text file. If not specified, the extracted text is returned.
+            Output text file. If not specified, the extracted text is written to
+            a text file with the same basename as the input file.
         password : str
             PDF password.
         encoding : str
@@ -159,11 +162,8 @@ def extract_text(self, input_path, output_path='',
             First page to extract (starting with 1).
         end_page : int
             Last page to extract (starting with 1).
-
-        Returns
-        -------
-        text : str
-            Extracted text. If `output_path` is not specified, nothing is returned.
+        console : bool
+            If True, write output to console.
         """
 
         options = (' -password {password}'.format(password=password) if password else '') +\
@@ -172,17 +172,12 @@ def extract_text(self, input_path, output_path='',
                   (' -sort' if sort else '') +\
                   (' -ignoreBeads' if ignore_beads else '') +\
                   (' -startPage {start_page}'.format(start_page=start_page) if start_page else '') +\
-                  (' -endPage {end_page}'.format(end_page=end_page) if end_page else '')
-        if not output_path:
-            options += ' -console'
-        cmd = '{java_path} -jar {pdfbox_path} ExtractText {options} {input_path} {output_path}'.format(java_path=self.java_path,
-                                                                                                       pdfbox_path=self.pdfbox_path,
-                                                                                                       options=options,
-                                                                                                       input_path=input_path,
-                                                                                                       output_path=output_path)
-        p = sarge.capture_stdout(cmd)
-        if not output_path:
-            return p.stdout.text
+                  (' -endPage {end_page}'.format(end_page=end_page) if end_page else '') +\
+                  '{console}'.format(console='-console') if console else ''
+        cmd = '{options} {input_path} {output_path}'.format(options=options,
+                                                            input_path=str(pathlib.Path(input_path).expanduser()),
+                                                            output_path=output_path).strip()
+        self.pdfbox_tools.ExtractText.main(cmd.split(' '))
 
     def pdf_to_images(self, input_path, password=None,
                       imageType=None, outputPrefix=None,
@@ -224,11 +219,6 @@ def pdf_to_images(self, input_path, password=None,
             The page area to export, e.g "34 45 56 67"
         time : bool
             If True, print timing information to stdout.
-
-        Returns
-        -------
-        text : str
-            Time taken to complete the process.
         """
 
         options = (' -password {password}'.format(password=password) if password else '') + \
@@ -242,12 +232,9 @@ def pdf_to_images(self, input_path, password=None,
                   (' -cropbox {cropbox}'.format(cropbox=cropbox) if cropbox else '') + \
                   (' {time}'.format(time="-time") if time else '')
 
-        cmd = '{java_path} -jar {pdfbox_path} PDFToImage {options} {input_path}'.format(java_path=self.java_path,
-                                                                                                       pdfbox_path=self.pdfbox_path,
-                                                                                                       options=options,
-                                                                                                       input_path=input_path)
-        p = sarge.capture_both(cmd)
-        return p.stderr.text
+        cmd = '{options} {input_path}'.format(options=options,                                              
+                                              input_path=input_path).strip()
+        self.pdfbox_tools.PDFToImage.main(cmd.split(' '))
 
     def extract_images(self, input_path, password=None, prefix=None, directJPEG=False):
         """
@@ -263,20 +250,12 @@ def extract_images(self, input_path, password=None, prefix=None, directJPEG=Fals
             The prefix to the image file (default: name of PDF document).
         directJPEG: bool
             Forces the direct extraction of JPEG images regardless of colorspace (default: False).
-
-        Returns
-        -------
-        text : str
-            Time taken to complete the process.
         """
 
         options = (' -password {password}'.format(password=password) if password else '') + \
                   (' -prefix {prefix}'.format(prefix=prefix) if prefix else '') + \
                   (' -directJPEG {directJPEG}'.format(directJPEG="-directJPEG") if directJPEG else '')
 
-        cmd = '{java_path} -jar {pdfbox_path} ExtractImages {options} {input_path}'.format(java_path=self.java_path,
-                                                                                        pdfbox_path=self.pdfbox_path,
-                                                                                        options=options,
-                                                                                        input_path=input_path)
-        p = sarge.capture_both(cmd)
-        return p.stderr.text
+        cmd = '{options} {input_path}'.format(options=options,
+                                              input_path=input_path).strip()
+        self.pdfbox_tools.ExtractImages.main(cmd.split(' '))
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 from setuptools import setup
 
 NAME =               'python-pdfbox'
-VERSION =            '0.1.6'
+VERSION =            '0.1.7'
 AUTHOR =             'Lev E. Givon'
 AUTHOR_EMAIL =       '[email protected]'
 URL =                'https://github.com/lebedov/python-pdfbox/'
@@ -43,4 +43,4 @@
         url = URL,
         packages = find_packages(),
 	python_requires='>=3',
-        install_requires = ['appdirs', 'sarge', 'setuptools'])
+        install_requires = ['appdirs', 'jpype1', 'setuptools'])
diff --git a/tests/test_pdfbox.py b/tests/test_pdfbox.py
@@ -4,35 +4,33 @@
 from pathlib import Path
 import pdfbox
 import os
-from sys import platform
+from tempfile import TemporaryDirectory
+
 # To generate test PDF, process test.md with pandoc using the command
 # pandoc -t latex test.md -o test.pdf
-from tempfile import TemporaryDirectory
+
 class test_pdfbox(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.p = pdfbox.PDFBox()
+
     def test_extract_text(self):
-        p = pdfbox.PDFBox()
-        text = p.extract_text('./test.pdf')
-        if platform == "linux" or platform == "linux2" or platform == "darwin":
-            self.assertEqual(text, 'this is a test PDF\n')
-        elif platform == "win32":
-            self.assertEqual(text, 'this is a test PDF\r\n')
+        with TemporaryDirectory() as output_dir:
+            output_path = (Path(output_dir) / 'test.txt').resolve()
+            self.p.extract_text('./test.pdf', output_path)
+            self.assertTrue('test.txt' in os.listdir(output_dir))
 
     def test_pdf_to_images(self):
-        p = pdfbox.PDFBox()
-
         with TemporaryDirectory() as output_dir:
             output_prefix = (Path(output_dir) / 'test').resolve()
-            result = p.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
+            self.p.pdf_to_images('./test2.pdf', outputPrefix=output_prefix)
             self.assertTrue('test1.jpg' in os.listdir(output_dir) and 'test2.jpg' in os.listdir(output_dir))
 
     def test_extract_images(self):
-        p = pdfbox.PDFBox()
-
         with TemporaryDirectory() as output_dir:
             output_prefix = (Path(output_dir) / 'test').resolve()
-            result = p.extract_images('./test3.pdf', prefix=output_prefix)
+            self.p.extract_images('./test3.pdf', prefix=output_prefix)
             self.assertTrue('test-1.png' in os.listdir(output_dir))
 
-
 if __name__ == '__main__':
     main()