PDFunite: fix too many open files
For large PDFs (1200+ pages), the pdfunite command would fail on some
systems (e.g. Qubes) because it was invoked with 1024+ file arguments,
exceeding the limit on open file descriptors (`ulimit -n`).

This fix splits the merge into batches, accumulating the result in a
single PDF and then merging it with the next batch.
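
For illustration, a minimal standalone sketch of this batch-and-accumulate pattern (the function name, batch size, and use of subprocess are assumptions for the example, not code from this commit):

    import os
    import subprocess
    from typing import List

    BATCH_SIZE = 50  # assumed; must stay comfortably below `ulimit -n`

    def merge_pdfs(page_paths: List[str], out_path: str) -> None:
        """Merge many single-page PDFs with pdfunite, one batch at a time."""
        tmp_path = out_path + ".tmp"
        for start in range(0, len(page_paths), BATCH_SIZE):
            args = ["pdfunite"]
            if start > 0:
                # Prepend the pages accumulated so far to keep page order.
                args.append(out_path)
            args += page_paths[start : start + BATCH_SIZE]
            args.append(tmp_path)
            subprocess.run(args, check=True)
            os.rename(tmp_path, out_path)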
deeplow committed Nov 2, 2023
1 parent ebfed4e commit f7190e3
Showing 3 changed files with 38 additions and 18 deletions.
8 changes: 8 additions & 0 deletions dangerzone/conversion/common.py

@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE)
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0
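
To make the helper's arithmetic concrete, assuming for illustration that PAGE_BATCH_SIZE is 50 (its actual value is defined elsewhere in common.py):

    # Hypothetical numbers: a 1200-page document with a 600-second total
    # timeout is split into int(1200 / 50) = 24 batches, giving each
    # pdfunite call 600 / 24 = 25.0 seconds.
    print(get_batch_timeout(600.0, 1200))  # 25.0
    print(get_batch_timeout(None, 1200))   # None (no timeout configured)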
6 changes: 2 additions & 4 deletions dangerzone/conversion/doc_to_pixels.py

@@ -23,6 +23,7 @@
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ async def convert(self) -> None:
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
 
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time
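
The loop above consumes batch_iterator, whose body is not part of this diff; the following is a plausible sketch reconstructed from its signature in the common.py hunk header above (an assumption, not the committed implementation):

    from typing import Generator, Tuple

    PAGE_BATCH_SIZE = 50  # assumed value for illustration

    def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
        # Yield inclusive, 1-based (first_page, last_page) ranges covering
        # at most PAGE_BATCH_SIZE pages each.
        for first_page in range(1, num_pages + 1, PAGE_BATCH_SIZE):
            yield (first_page, min(first_page + PAGE_BATCH_SIZE - 1, num_pages))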
42 changes: 28 additions & 14 deletions dangerzone/conversion/pixels_to_pdf.py

@@ -13,7 +13,12 @@
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ async def convert(
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
-        args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
-            args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
-        await self.run_command(
-            args,
-            error_message="Merging pages into a single PDF failed",
-            timeout_message=(
-                "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
+        for first_page, last_page in batch_iterator(num_pages):
+            args = ["pdfunite"]
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
+                args.append(f"{tempdir}/page-{page}.pdf")
+            args.append(accumulator_temp)
+            await self.run_command(
+                args,
+                error_message="Merging pages into a single PDF failed",
+                timeout_message=(
+                    "Error merging pages into a single PDF, pdfunite timed out after"
+                    f" {timeout_per_batch} seconds"
+                ),
+                timeout=timeout_per_batch,
+            )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2
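
Concretely, with an assumed PAGE_BATCH_SIZE of 50 and paths abbreviated, the first two pdfunite invocations would look like:

    pdfunite page-1.pdf ... page-50.pdf safe-output_tmp.pdf
    pdfunite safe-output.pdf page-51.pdf ... page-100.pdf safe-output_tmp.pdf

Each call thus opens at most PAGE_BATCH_SIZE + 2 files at once, regardless of the document's total page count, and removing each batch's page files afterwards keeps the temporary directory from growing with the document.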
