From 773633b09baab78b0f252748e158c2cbb832d738 Mon Sep 17 00:00:00 2001 From: deeplow Date: Thu, 25 Jan 2024 11:34:11 +0000 Subject: [PATCH] Remove timeouts Remove timeouts due to several reasons: 1. Lost purpose: after implementing the containers page streaming the only subprocess we have left is LibreOffice. So don't have such a big risk of commands hanging (the original reason for timeouts). 2. Little benefit: predicting execution time is generically unsolvable computer science problem. Ultimately we were guessing an arbitrary time based on the number of pages and the document size. As a guess we made it pretty lax (30s per page or MB). A document hanging for this long will probably lead to user frustration in any case and the user may be compelled to abort the conversion. 3. Technical Challenges with non-blocking timeout: there have been several technical challenges in keeping timeouts that we've made effort to accommodate. A significant one was having to do non-blocking read to ensure we could timeout when reading conversion stream (and then used here) Fixes #687 --- CHANGELOG.md | 1 + dangerzone/cli.py | 6 +-- dangerzone/conversion/common.py | 49 +++------------------- dangerzone/conversion/pixels_to_pdf.py | 7 +--- dangerzone/gui/__init__.py | 4 +- dangerzone/isolation_provider/base.py | 43 +++++++------------ dangerzone/isolation_provider/container.py | 15 +------ dangerzone/isolation_provider/qubes.py | 2 - tests/isolation_provider/test_container.py | 2 +- tests/test_ocr.py | 2 +- 10 files changed, 31 insertions(+), 100 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d312d50b..111991238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or - Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET)) - Replace Dangerzone document rendering engine from pdftoppm PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658)) +- Removed timeouts ([issue #687](https://github.com/freedomofpress/dangerzone/issues/687)) ## Dangerzone 0.5.1 diff --git a/dangerzone/cli.py b/dangerzone/cli.py index 2361399f6..b582d869e 100644 --- a/dangerzone/cli.py +++ b/dangerzone/cli.py @@ -39,9 +39,9 @@ def print_header(s: str) -> None: ) @click.option( "--enable-timeouts / --disable-timeouts", - default=True, + default=False, show_default=True, - help="Enable/Disable timeouts during document conversion", + help="(DEPRECATED - timeouts were removed) Enable/Disable timeouts during document conversion", ) @click.argument( "filenames", @@ -67,7 +67,7 @@ def cli_main( elif is_qubes_native_conversion(): dangerzone = DangerzoneCore(Qubes()) else: - dangerzone = DangerzoneCore(Container(enable_timeouts=enable_timeouts)) + dangerzone = DangerzoneCore(Container()) display_banner() if len(filenames) == 1 and output_filename: diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py index f3649c949..4c92bb8af 100644 --- a/dangerzone/conversion/common.py +++ b/dangerzone/conversion/common.py @@ -12,10 +12,8 @@ from abc import abstractmethod from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union -TIMEOUT_PER_PAGE: float = 30 # (seconds) -TIMEOUT_PER_MB: float = 30 # (seconds) -TIMEOUT_MIN: float = 60 # (seconds) DEFAULT_DPI = 150 # Pixels per inch +INT_BYTES = 2 def running_on_qubes() -> bool: @@ -23,28 +21,6 @@ def running_on_qubes() -> bool: return os.path.exists("/usr/share/qubes/marker-vm") -def calculate_timeout(size: float, pages: Optional[float] = None) -> float: - """Calculate the timeout for a command. - - The timeout calculation takes two factors in mind: - - 1. The size (in MiBs) of the dataset (document, multiple pages). - 2. The number of pages in the dataset. - - It then calculates proportional timeout values based on the above, and keeps the - large one. This way, we can handle several corner cases: - - * Documents with lots of pages, but small file size. - * Single images with large file size. - """ - # Do not have timeouts lower than 10 seconds, if the file size is small, since - # we need to take into account the program's startup time as well. - timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN) - if pages: - timeout = max(timeout, TIMEOUT_PER_PAGE * pages) - return timeout - - def get_tessdata_dir() -> str: if running_on_qubes(): return "/usr/share/tesseract/tessdata/" @@ -76,7 +52,7 @@ def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None: @classmethod def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None: - cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file) + cls._write_bytes(num.to_bytes(INT_BYTES, "big", signed=False), file=file) # ==== ASYNC METHODS ==== # We run sync methods in async wrappers, because pure async methods are more difficult: @@ -127,8 +103,6 @@ async def run_command( args: List[str], *, error_message: str, - timeout_message: Optional[str] = None, - timeout: Optional[float] = None, stdout_callback: Optional[Callable] = None, stderr_callback: Optional[Callable] = None, ) -> Tuple[bytes, bytes]: @@ -138,7 +112,6 @@ async def run_command( output in bytes. :raises RuntimeError: if the process returns a non-zero exit status - :raises TimeoutError: if the process times out """ # Start the provided command, and return a handle. The command will run in the # background. @@ -164,12 +137,9 @@ async def run_command( self.read_stream(proc.stderr, stderr_callback) ) - # Wait until the command has finished, for a specific timeout. Then, verify that the - # command has completed successfully. In any other case, raise an exception. - try: - ret = await asyncio.wait_for(proc.wait(), timeout=timeout) - except asyncio.exceptions.TimeoutError: - raise TimeoutError(timeout_message) + # Wait until the command has finished. Then, verify that the command + # has completed successfully. In any other case, raise an exception. + ret = await asyncio.wait_for(proc.wait(), timeout=None) if ret != 0: raise RuntimeError(error_message) @@ -179,15 +149,6 @@ async def run_command( stderr = await stderr_task return (stdout, stderr) - def calculate_timeout( - self, size: float, pages: Optional[float] = None - ) -> Optional[float]: - """Calculate the timeout for a command.""" - if not int(os.environ.get("ENABLE_TIMEOUTS", 1)): - return None - - return calculate_timeout(size, pages) - @abstractmethod async def convert(self) -> None: pass diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py index 072e197d5..362a769e4 100644 --- a/dangerzone/conversion/pixels_to_pdf.py +++ b/dangerzone/conversion/pixels_to_pdf.py @@ -49,7 +49,6 @@ async def convert( # The first few operations happen on a per-page basis. page_size = len(untrusted_rgb_data) total_size += page_size - timeout = self.calculate_timeout(page_size, 1) pixmap = fitz.Pixmap( fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False ) @@ -75,10 +74,6 @@ async def convert( safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes)) self.percentage += percentage_per_page - # Next operations apply to the all the pages, so we need to recalculate the - # timeout. - timeout = self.calculate_timeout(total_size, num_pages) - self.percentage = 100.0 self.update_progress("Safe PDF created") @@ -110,7 +105,7 @@ async def main() -> int: try: await converter.convert(ocr_lang) return 0 - except (RuntimeError, TimeoutError, ValueError) as e: + except (RuntimeError, ValueError) as e: converter.update_progress(str(e), error=True) return 1 diff --git a/dangerzone/gui/__init__.py b/dangerzone/gui/__init__.py index c440c0148..b8995f965 100644 --- a/dangerzone/gui/__init__.py +++ b/dangerzone/gui/__init__.py @@ -100,7 +100,7 @@ def infer_os_color_mode(self) -> OSColorMode: "--enable-timeouts / --disable-timeouts", default=True, show_default=True, - help="Enable/Disable timeouts during document conversion", + help="(DEPRECATED - timeouts were removed) Enable/Disable timeouts during document conversion", ) @click.argument( "filenames", @@ -138,7 +138,7 @@ def gui_main( qubes = Qubes() dangerzone = DangerzoneGui(app, isolation_provider=qubes) else: - container = Container(enable_timeouts=enable_timeouts) + container = Container() dangerzone = DangerzoneGui(app, isolation_provider=container) # Allow Ctrl-C to smoothly quit the program instead of throwing an exception diff --git a/dangerzone/isolation_provider/base.py b/dangerzone/isolation_provider/base.py index 3b51d58cf..f637ff382 100644 --- a/dangerzone/isolation_provider/base.py +++ b/dangerzone/isolation_provider/base.py @@ -10,9 +10,9 @@ from colorama import Fore, Style from ..conversion import errors -from ..conversion.common import calculate_timeout +from ..conversion.common import INT_BYTES from ..document import Document -from ..util import Stopwatch, nonblocking_read, replace_control_chars +from ..util import replace_control_chars log = logging.getLogger(__name__) @@ -23,25 +23,26 @@ PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----" -def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes: +def read_bytes(f: IO[bytes], size: int, exact: bool = True) -> bytes: """Read bytes from a file-like object.""" - buf = nonblocking_read(f, size, timeout) + buf = f.read(size) if exact and len(buf) != size: raise errors.ConverterProcException() return buf -def read_int(f: IO[bytes], timeout: float) -> int: +def read_int(f: IO[bytes]) -> int: """Read 2 bytes from a file-like object, and decode them as int.""" - untrusted_int = read_bytes(f, 2, timeout) + untrusted_int = f.read(INT_BYTES) + if len(untrusted_int) != INT_BYTES: + raise errors.ConverterProcException() return int.from_bytes(untrusted_int, "big", signed=False) def read_debug_text(f: IO[bytes], size: int) -> str: """Read arbitrarily long text (for debug purposes)""" - timeout = calculate_timeout(size) - untrusted_text = read_bytes(f, size, timeout, exact=False) - return untrusted_text.decode("ascii", errors="replace") + untrusted_text = f.read(size).decode("ascii", errors="replace") + return replace_control_chars(untrusted_text) class IsolationProvider(ABC): @@ -49,8 +50,6 @@ class IsolationProvider(ABC): Abstracts an isolation provider """ - STARTUP_TIME_SECONDS = 0 # The maximum time it takes a the provider to start up. - def __init__(self) -> None: self.percentage = 0.0 self.proc: Optional[subprocess.Popen] = None @@ -105,28 +104,18 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None: except BrokenPipeError as e: raise errors.ConverterProcException() - # Get file size (in MiB) - size = os.path.getsize(document.input_filename) / 1024**2 - timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS - - assert self.proc is not None - assert self.proc.stdout is not None - os.set_blocking(self.proc.stdout.fileno(), False) - - n_pages = read_int(self.proc.stdout, timeout) + assert self.proc.stdout + n_pages = read_int(self.proc.stdout) if n_pages == 0 or n_pages > errors.MAX_PAGES: raise errors.MaxPagesException() percentage_per_page = 50.0 / n_pages - timeout = calculate_timeout(size, n_pages) - sw = Stopwatch(timeout) - sw.start() for page in range(1, n_pages + 1): text = f"Converting page {page}/{n_pages} to pixels" self.print_progress(document, False, text, self.percentage) - width = read_int(self.proc.stdout, timeout=sw.remaining) - height = read_int(self.proc.stdout, timeout=sw.remaining) + width = read_int(self.proc.stdout) + height = read_int(self.proc.stdout) if not (1 <= width <= errors.MAX_PAGE_WIDTH): raise errors.MaxPageWidthException() if not (1 <= height <= errors.MAX_PAGE_HEIGHT): @@ -136,7 +125,6 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None: untrusted_pixels = read_bytes( self.proc.stdout, num_pixels, - timeout=sw.remaining, ) # Wrapper code @@ -157,8 +145,7 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None: self.print_progress(document, False, text, self.percentage) if getattr(sys, "dangerzone_dev", False): - assert self.proc.stderr is not None - os.set_blocking(self.proc.stderr.fileno(), False) + assert self.proc.stderr untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS) self.proc.stderr.close() log.info( diff --git a/dangerzone/isolation_provider/container.py b/dangerzone/isolation_provider/container.py index afa8ea2db..8875210af 100644 --- a/dangerzone/isolation_provider/container.py +++ b/dangerzone/isolation_provider/container.py @@ -33,11 +33,6 @@ def __init__(self, container_tech: str) -> None: class Container(IsolationProvider): # Name of the dangerzone container CONTAINER_NAME = "dangerzone.rocks/dangerzone" - STARTUP_TIME_SECONDS = 5 - - def __init__(self, enable_timeouts: bool) -> None: - self.enable_timeouts = 1 if enable_timeouts else 0 - super().__init__() @staticmethod def get_runtime_name() -> str: @@ -225,8 +220,6 @@ def pixels_to_pdf( f"OCR={0 if ocr_lang is None else 1}", "-e", f"OCR_LANGUAGE={ocr_lang}", - "-e", - f"ENABLE_TIMEOUTS={self.enable_timeouts}", ] pixels_to_pdf_proc = self.exec_container(command, extra_args) @@ -254,14 +247,10 @@ def start_doc_to_pixels_proc(self) -> subprocess.Popen: "-m", "dangerzone.conversion.doc_to_pixels", ] - extra_args = [ - "-e", - f"ENABLE_TIMEOUTS={self.enable_timeouts}", - ] - return self.exec_container(command, extra_args) + return self.exec_container(command) def get_max_parallel_conversions(self) -> int: - # FIXME hardcoded 1 until timeouts are more limited and better handled + # FIXME hardcoded 1 until lenght conversions are better handled # https://github.com/freedomofpress/dangerzone/issues/257 return 1 diff --git a/dangerzone/isolation_provider/qubes.py b/dangerzone/isolation_provider/qubes.py index f778afebc..4c098f775 100644 --- a/dangerzone/isolation_provider/qubes.py +++ b/dangerzone/isolation_provider/qubes.py @@ -23,8 +23,6 @@ class Qubes(IsolationProvider): """Uses a disposable qube for performing the conversion""" - STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes - def __init__(self) -> None: self.proc: Optional[subprocess.Popen] = None super().__init__() diff --git a/tests/isolation_provider/test_container.py b/tests/isolation_provider/test_container.py index 3bef55c98..a6de19b9a 100644 --- a/tests/isolation_provider/test_container.py +++ b/tests/isolation_provider/test_container.py @@ -22,7 +22,7 @@ @pytest.fixture def provider() -> Container: - return Container(enable_timeouts=False) + return Container() class TestContainer(IsolationProviderTest): diff --git a/tests/test_ocr.py b/tests/test_ocr.py index dc23ea505..d7f03706e 100644 --- a/tests/test_ocr.py +++ b/tests/test_ocr.py @@ -49,7 +49,7 @@ def test_ocr_ommisions() -> None: installed_langs -= {"osd", "equ"} # Grab the languages that Dangerzone offers to the user through the GUI/CLI. - offered_langs = set(DangerzoneCore(Container(True)).ocr_languages.values()) + offered_langs = set(DangerzoneCore(Container()).ocr_languages.values()) # Ensure that both the installed languages and the ones we offer to the user are the # same.