From f7190e3876ce3ab33fb0db9b6e94dc9ff872a531 Mon Sep 17 00:00:00 2001
From: deeplow
Date: Thu, 2 Nov 2023 15:00:10 +0000
Subject: [PATCH] PDFunite: fix too many open files

For large PDFs (1200+ pages), the pdfunite command would fail on some
systems (e.g. Qubes) because it was invoked with 1024+ files at once,
exceeding the open-file limit (`ulimit -n`). This fix splits the merge
into batches, accumulating the result in a single PDF that is then
merged with the next batch.
---
 dangerzone/conversion/common.py        |  8 +++++
 dangerzone/conversion/doc_to_pixels.py |  6 ++--
 dangerzone/conversion/pixels_to_pdf.py | 42 +++++++++++++++++---------
 3 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py
index f15954371..768118905 100644
--- a/dangerzone/conversion/common.py
+++ b/dangerzone/conversion/common.py
@@ -56,6 +56,14 @@ def batch_iterator(num_pages: int) -> Generator[Tuple[int, int], None, None]:
         yield (first_page, last_page)
 
 
+def get_batch_timeout(timeout: Optional[float], num_pages: int) -> Optional[float]:
+    if timeout is None:
+        return None
+    else:
+        num_batches = int(num_pages / PAGE_BATCH_SIZE) + 1
+        return timeout / num_batches
+
+
 class DangerzoneConverter:
     def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.percentage: float = 0.0
diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py
index f871bf470..70edae32f 100644
--- a/dangerzone/conversion/doc_to_pixels.py
+++ b/dangerzone/conversion/doc_to_pixels.py
@@ -23,6 +23,7 @@
     PAGE_BATCH_SIZE,
     DangerzoneConverter,
     batch_iterator,
+    get_batch_timeout,
     running_on_qubes,
 )
 
@@ -283,10 +284,7 @@ async def convert(self) -> None:
 
         # Get a more precise timeout, based on the number of pages
         timeout = self.calculate_timeout(size, num_pages)
-        if timeout is None:
-            timeout_per_batch = None
-        else:
-            timeout_per_batch = timeout / (int(num_pages / PAGE_BATCH_SIZE) + 1)
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         for first_page, last_page in batch_iterator(num_pages):
             # XXX send data from the previous loop's conversion to
             # always be able to process and send data at the same time
diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
index c60ee818d..dfaab61eb 100644
--- a/dangerzone/conversion/pixels_to_pdf.py
+++ b/dangerzone/conversion/pixels_to_pdf.py
@@ -13,7 +13,12 @@
 import sys
 from typing import Optional
 
-from .common import DangerzoneConverter, running_on_qubes
+from .common import (
+    DangerzoneConverter,
+    batch_iterator,
+    get_batch_timeout,
+    running_on_qubes,
+)
 
 
 class PixelsToPDF(DangerzoneConverter):
@@ -89,20 +94,29 @@ async def convert(
         timeout = self.calculate_timeout(total_size, num_pages)
 
         # Merge pages into a single PDF
+        timeout_per_batch = get_batch_timeout(timeout, num_pages)
         self.update_progress(f"Merging {num_pages} pages into a single PDF")
-        args = ["pdfunite"]
-        for page in range(1, num_pages + 1):
-            args.append(f"{tempdir}/page-{page}.pdf")
-        args.append(f"{tempdir}/safe-output.pdf")
-        await self.run_command(
-            args,
-            error_message="Merging pages into a single PDF failed",
-            timeout_message=(
-                "Error merging pages into a single PDF, pdfunite timed out after"
-                f" {timeout} seconds"
-            ),
-            timeout=timeout,
-        )
+        for first_page, last_page in batch_iterator(num_pages):
+            args = ["pdfunite"]
+            accumulator = f"{tempdir}/safe-output.pdf"  # PDF which accumulates pages
+            accumulator_temp = f"{tempdir}/safe-output_tmp.pdf"
+            if first_page > 1:  # Append at the beginning
+                args.append(accumulator)
+            for page in range(first_page, last_page + 1):
+                args.append(f"{tempdir}/page-{page}.pdf")
+            args.append(accumulator_temp)
+            await self.run_command(
+                args,
+                error_message="Merging pages into a single PDF failed",
+                timeout_message=(
+                    "Error merging pages into a single PDF, pdfunite timed out after"
+                    f" {timeout_per_batch} seconds"
+                ),
+                timeout=timeout_per_batch,
+            )
+            for page in range(first_page, last_page + 1):
+                os.remove(f"{tempdir}/page-{page}.pdf")
+            os.rename(accumulator_temp, accumulator)
 
         self.percentage += 2
 
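
For reference, the batch-merge idea used above can be sketched on its own: call
pdfunite on at most one batch of pages at a time, feeding the accumulated output
back in as the first input of the next call, so no single invocation opens more
files than the batch size plus one. This is a minimal standalone sketch assuming
pdfunite is on PATH; merge_pdfs_in_batches and BATCH_SIZE are illustrative names,
not part of the Dangerzone code.

import os
import subprocess
from typing import List

BATCH_SIZE = 50  # illustrative; the patch uses PAGE_BATCH_SIZE from common.py

def merge_pdfs_in_batches(page_pdfs: List[str], output_pdf: str) -> None:
    """Merge many single-page PDFs into output_pdf without exceeding the
    open-file limit (ulimit -n), by passing pdfunite at most BATCH_SIZE
    inputs per call and accumulating the result."""
    accumulator_temp = output_pdf + ".tmp"
    for start in range(0, len(page_pdfs), BATCH_SIZE):
        batch = page_pdfs[start : start + BATCH_SIZE]
        args = ["pdfunite"]
        if start > 0:
            # Prepend the pages merged so far, so page order is preserved
            args.append(output_pdf)
        args.extend(batch)
        args.append(accumulator_temp)
        subprocess.run(args, check=True)
        # The accumulated result becomes an input of the next batch
        os.replace(accumulator_temp, output_pdf)

# Hypothetical usage:
# merge_pdfs_in_batches([f"page-{i}.pdf" for i in range(1, 1201)], "safe-output.pdf")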