PDFunite: fix too many open files
For large PDFs (1200+ pages), the pdfunite command would fail on some
systems (e.g. Qubes) because it was invoked with 1024+ file arguments,
exceeding the limit on open file descriptors (`ulimit -n`).

This fix splits the merge into batches, accumulating the result in a
single PDF and then merging it with the next batch.
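
For illustration, a minimal standalone sketch of this batch-and-accumulate pattern (the function name, batch size, and use of subprocess are assumptions for the example, not code from this commit):

    import os
    import subprocess
    from typing import List

    BATCH_SIZE = 50  # assumed; must stay comfortably below `ulimit -n`

    def merge_pdfs(page_paths: List[str], out_path: str) -> None:
        """Merge many single-page PDFs with pdfunite, one batch at a time."""
        tmp_path = out_path + ".tmp"
        for start in range(0, len(page_paths), BATCH_SIZE):
            args = ["pdfunite"]
            if start > 0:
                # Prepend the pages accumulated so far to keep page order.
                args.append(out_path)
            args += page_paths[start : start + BATCH_SIZE]
            args.append(tmp_path)
            subprocess.run(args, check=True)
            os.rename(tmp_path, out_path)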
deeplow committed Nov 2, 2023
1 parent ebfed4e commit f7190e3
Showing 3 changed files with 38 additions and 18 deletions.
8 changes: 8 additions & 0 deletions dangerzone/conversion/common.py

@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE)
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0
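
To make the helper's arithmetic concrete, assuming for illustration that PAGE_BATCH_SIZE is 50 (its actual value is defined elsewhere in common.py):

    # Hypothetical numbers: a 1200-page document with a 600-second total
    # timeout is split into int(1200 / 50) = 24 batches, giving each
    # pdfunite call 600 / 24 = 25.0 seconds.
    print(get_batch_timeout(600.0, 1200))  # 25.0
    print(get_batch_timeout(None, 1200))   # None (no timeout configured)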
6 changes: 2 additions & 4 deletions dangerzone/conversion/doc_to_pixels.py

@@ -23,6 +23,7 @@
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ async def convert(self) -> None:
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
 
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time
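
The loop above consumes batch_iterator, whose body is not part of this diff; the following is a plausible sketch reconstructed from its signature in the common.py hunk header above (an assumption, not the committed implementation):

    from typing import Generator, Tuple

    PAGE_BATCH_SIZE = 50  # assumed value for illustration

    def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
        # Yield inclusive, 1-based (first_page, last_page) ranges covering
        # at most PAGE_BATCH_SIZE pages each.
        for first_page in range(1, num_pages + 1, PAGE_BATCH_SIZE):
            yield (first_page, min(first_page + PAGE_BATCH_SIZE - 1, num_pages))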
42 changes: 28 additions & 14 deletions dangerzone/conversion/pixels_to_pdf.py

@@ -13,7 +13,12 @@
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ async def convert(
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
-        args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
-            args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
-        await self.run_command(
-            args,
-            error_message="Merging pages into a single PDF failed",
-            timeout_message=(
-                "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
+        for first_page, last_page in batch_iterator(num_pages):
+            args = ["pdfunite"]
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
+                args.append(f"{tempdir}/page-{page}.pdf")
+            args.append(accumulator_temp)
+            await self.run_command(
+                args,
+                error_message="Merging pages into a single PDF failed",
+                timeout_message=(
+                    "Error merging pages into a single PDF, pdfunite timed out after"
+                    f" {timeout_per_batch} seconds"
+                ),
+                timeout=timeout_per_batch,
+            )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2
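
Concretely, with an assumed PAGE_BATCH_SIZE of 50 and paths abbreviated, the first two pdfunite invocations would look like:

    pdfunite page-1.pdf ... page-50.pdf safe-output_tmp.pdf
    pdfunite safe-output.pdf page-51.pdf ... page-100.pdf safe-output_tmp.pdf

Each call thus opens at most PAGE_BATCH_SIZE + 2 files at once, regardless of the document's total page count, and removing each batch's page files afterwards keeps the temporary directory from growing with the document.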
