Skip to content

Commit

Permalink
Remove timeouts
Browse files Browse the repository at this point in the history
Remove timeouts due to several reasons:

1. Lost purpose: after implementing the containers page streaming the
   only subprocess we have left is LibreOffice. So don't have such a
   big risk of commands hanging (the original reason for timeouts).

2. Little benefit: predicting execution time is generically unsolvable
   computer science problem. Ultimately we were guessing an arbitrary
   time based on the number of pages and the document size. As a guess
   we made it pretty lax (30s per page or MB). A document hanging for
   this long will probably lead to user frustration in any case and the
   user may be compelled to abort the conversion.

3. Technical Challenges with non-blocking timeout: there have been
several technical challenges in keeping timeouts that we've made effort
to accommodate. A significant one was having to do non-blocking read to
ensure we could timeout when reading conversion stream (and then used
here)

Fixes #687
  • Loading branch information
deeplow committed Jan 25, 2024
1 parent a8e2ccc commit 773633b
Show file tree
Hide file tree
Showing 10 changed files with 31 additions and 100 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ since 0.4.1, and this project adheres to [Semantic Versioning](https://semver.or

- Feature: Add support for HWP/HWPX files (Hancom Office) for macOS Apple Silicon devices ([issue #498](https://github.com/freedomofpress/dangerzone/issues/498), thanks to [@OctopusET](https://github.com/OctopusET))
- Replace Dangerzone document rendering engine from pdftoppm PyMuPDF, essentially replacing a variety of tools (gm / tesseract / pdfunite / ps2pdf) ([issue #658](https://github.com/freedomofpress/dangerzone/issues/658))
- Removed timeouts ([issue #687](https://github.com/freedomofpress/dangerzone/issues/687))

## Dangerzone 0.5.1

Expand Down
6 changes: 3 additions & 3 deletions dangerzone/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def print_header(s: str) -> None:
)
@click.option(
"--enable-timeouts / --disable-timeouts",
default=True,
default=False,
show_default=True,
help="Enable/Disable timeouts during document conversion",
help="(DEPRECATED - timeouts were removed) Enable/Disable timeouts during document conversion",
)
@click.argument(
"filenames",
Expand All @@ -67,7 +67,7 @@ def cli_main(
elif is_qubes_native_conversion():
dangerzone = DangerzoneCore(Qubes())
else:
dangerzone = DangerzoneCore(Container(enable_timeouts=enable_timeouts))
dangerzone = DangerzoneCore(Container())

display_banner()
if len(filenames) == 1 and output_filename:
Expand Down
49 changes: 5 additions & 44 deletions dangerzone/conversion/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,39 +12,15 @@
from abc import abstractmethod
from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union

TIMEOUT_PER_PAGE: float = 30 # (seconds)
TIMEOUT_PER_MB: float = 30 # (seconds)
TIMEOUT_MIN: float = 60 # (seconds)
DEFAULT_DPI = 150 # Pixels per inch
INT_BYTES = 2


def running_on_qubes() -> bool:
# https://www.qubes-os.org/faq/#what-is-the-canonical-way-to-detect-qubes-vm
return os.path.exists("/usr/share/qubes/marker-vm")


def calculate_timeout(size: float, pages: Optional[float] = None) -> float:
"""Calculate the timeout for a command.
The timeout calculation takes two factors in mind:
1. The size (in MiBs) of the dataset (document, multiple pages).
2. The number of pages in the dataset.
It then calculates proportional timeout values based on the above, and keeps the
large one. This way, we can handle several corner cases:
* Documents with lots of pages, but small file size.
* Single images with large file size.
"""
# Do not have timeouts lower than 10 seconds, if the file size is small, since
# we need to take into account the program's startup time as well.
timeout = max(TIMEOUT_PER_MB * size, TIMEOUT_MIN)
if pages:
timeout = max(timeout, TIMEOUT_PER_PAGE * pages)
return timeout


def get_tessdata_dir() -> str:
if running_on_qubes():
return "/usr/share/tesseract/tessdata/"
Expand Down Expand Up @@ -76,7 +52,7 @@ def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None:

@classmethod
def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file)
cls._write_bytes(num.to_bytes(INT_BYTES, "big", signed=False), file=file)

# ==== ASYNC METHODS ====
# We run sync methods in async wrappers, because pure async methods are more difficult:
Expand Down Expand Up @@ -127,8 +103,6 @@ async def run_command(
args: List[str],
*,
error_message: str,
timeout_message: Optional[str] = None,
timeout: Optional[float] = None,
stdout_callback: Optional[Callable] = None,
stderr_callback: Optional[Callable] = None,
) -> Tuple[bytes, bytes]:
Expand All @@ -138,7 +112,6 @@ async def run_command(
output in bytes.
:raises RuntimeError: if the process returns a non-zero exit status
:raises TimeoutError: if the process times out
"""
# Start the provided command, and return a handle. The command will run in the
# background.
Expand All @@ -164,12 +137,9 @@ async def run_command(
self.read_stream(proc.stderr, stderr_callback)
)

# Wait until the command has finished, for a specific timeout. Then, verify that the
# command has completed successfully. In any other case, raise an exception.
try:
ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
except asyncio.exceptions.TimeoutError:
raise TimeoutError(timeout_message)
# Wait until the command has finished. Then, verify that the command
# has completed successfully. In any other case, raise an exception.
ret = await asyncio.wait_for(proc.wait(), timeout=None)
if ret != 0:
raise RuntimeError(error_message)

Expand All @@ -179,15 +149,6 @@ async def run_command(
stderr = await stderr_task
return (stdout, stderr)

def calculate_timeout(
self, size: float, pages: Optional[float] = None
) -> Optional[float]:
"""Calculate the timeout for a command."""
if not int(os.environ.get("ENABLE_TIMEOUTS", 1)):
return None

return calculate_timeout(size, pages)

@abstractmethod
async def convert(self) -> None:
pass
Expand Down
7 changes: 1 addition & 6 deletions dangerzone/conversion/pixels_to_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ async def convert(
# The first few operations happen on a per-page basis.
page_size = len(untrusted_rgb_data)
total_size += page_size
timeout = self.calculate_timeout(page_size, 1)
pixmap = fitz.Pixmap(
fitz.Colorspace(fitz.CS_RGB), width, height, untrusted_rgb_data, False
)
Expand All @@ -75,10 +74,6 @@ async def convert(
safe_doc.insert_pdf(fitz.open("pdf", page_pdf_bytes))
self.percentage += percentage_per_page

# Next operations apply to the all the pages, so we need to recalculate the
# timeout.
timeout = self.calculate_timeout(total_size, num_pages)

self.percentage = 100.0
self.update_progress("Safe PDF created")

Expand Down Expand Up @@ -110,7 +105,7 @@ async def main() -> int:
try:
await converter.convert(ocr_lang)
return 0
except (RuntimeError, TimeoutError, ValueError) as e:
except (RuntimeError, ValueError) as e:
converter.update_progress(str(e), error=True)
return 1

Expand Down
4 changes: 2 additions & 2 deletions dangerzone/gui/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def infer_os_color_mode(self) -> OSColorMode:
"--enable-timeouts / --disable-timeouts",
default=True,
show_default=True,
help="Enable/Disable timeouts during document conversion",
help="(DEPRECATED - timeouts were removed) Enable/Disable timeouts during document conversion",
)
@click.argument(
"filenames",
Expand Down Expand Up @@ -138,7 +138,7 @@ def gui_main(
qubes = Qubes()
dangerzone = DangerzoneGui(app, isolation_provider=qubes)
else:
container = Container(enable_timeouts=enable_timeouts)
container = Container()
dangerzone = DangerzoneGui(app, isolation_provider=container)

# Allow Ctrl-C to smoothly quit the program instead of throwing an exception
Expand Down
43 changes: 15 additions & 28 deletions dangerzone/isolation_provider/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from colorama import Fore, Style

from ..conversion import errors
from ..conversion.common import calculate_timeout
from ..conversion.common import INT_BYTES
from ..document import Document
from ..util import Stopwatch, nonblocking_read, replace_control_chars
from ..util import replace_control_chars

log = logging.getLogger(__name__)

Expand All @@ -23,34 +23,33 @@
PIXELS_TO_PDF_LOG_END = "----- PIXELS TO PDF LOG END -----"


def read_bytes(f: IO[bytes], size: int, timeout: float, exact: bool = True) -> bytes:
def read_bytes(f: IO[bytes], size: int, exact: bool = True) -> bytes:
"""Read bytes from a file-like object."""
buf = nonblocking_read(f, size, timeout)
buf = f.read(size)
if exact and len(buf) != size:
raise errors.ConverterProcException()
return buf


def read_int(f: IO[bytes], timeout: float) -> int:
def read_int(f: IO[bytes]) -> int:
"""Read 2 bytes from a file-like object, and decode them as int."""
untrusted_int = read_bytes(f, 2, timeout)
untrusted_int = f.read(INT_BYTES)
if len(untrusted_int) != INT_BYTES:
raise errors.ConverterProcException()
return int.from_bytes(untrusted_int, "big", signed=False)


def read_debug_text(f: IO[bytes], size: int) -> str:
"""Read arbitrarily long text (for debug purposes)"""
timeout = calculate_timeout(size)
untrusted_text = read_bytes(f, size, timeout, exact=False)
return untrusted_text.decode("ascii", errors="replace")
untrusted_text = f.read(size).decode("ascii", errors="replace")
return replace_control_chars(untrusted_text)


class IsolationProvider(ABC):
"""
Abstracts an isolation provider
"""

STARTUP_TIME_SECONDS = 0 # The maximum time it takes a the provider to start up.

def __init__(self) -> None:
self.percentage = 0.0
self.proc: Optional[subprocess.Popen] = None
Expand Down Expand Up @@ -105,28 +104,18 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None:
except BrokenPipeError as e:
raise errors.ConverterProcException()

# Get file size (in MiB)
size = os.path.getsize(document.input_filename) / 1024**2
timeout = calculate_timeout(size) + self.STARTUP_TIME_SECONDS

assert self.proc is not None
assert self.proc.stdout is not None
os.set_blocking(self.proc.stdout.fileno(), False)

n_pages = read_int(self.proc.stdout, timeout)
assert self.proc.stdout
n_pages = read_int(self.proc.stdout)
if n_pages == 0 or n_pages > errors.MAX_PAGES:
raise errors.MaxPagesException()
percentage_per_page = 50.0 / n_pages

timeout = calculate_timeout(size, n_pages)
sw = Stopwatch(timeout)
sw.start()
for page in range(1, n_pages + 1):
text = f"Converting page {page}/{n_pages} to pixels"
self.print_progress(document, False, text, self.percentage)

width = read_int(self.proc.stdout, timeout=sw.remaining)
height = read_int(self.proc.stdout, timeout=sw.remaining)
width = read_int(self.proc.stdout)
height = read_int(self.proc.stdout)
if not (1 <= width <= errors.MAX_PAGE_WIDTH):
raise errors.MaxPageWidthException()
if not (1 <= height <= errors.MAX_PAGE_HEIGHT):
Expand All @@ -136,7 +125,6 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None:
untrusted_pixels = read_bytes(
self.proc.stdout,
num_pixels,
timeout=sw.remaining,
)

# Wrapper code
Expand All @@ -157,8 +145,7 @@ def doc_to_pixels(self, document: Document, tempdir: str) -> None:
self.print_progress(document, False, text, self.percentage)

if getattr(sys, "dangerzone_dev", False):
assert self.proc.stderr is not None
os.set_blocking(self.proc.stderr.fileno(), False)
assert self.proc.stderr
untrusted_log = read_debug_text(self.proc.stderr, MAX_CONVERSION_LOG_CHARS)
self.proc.stderr.close()
log.info(
Expand Down
15 changes: 2 additions & 13 deletions dangerzone/isolation_provider/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,6 @@ def __init__(self, container_tech: str) -> None:
class Container(IsolationProvider):
# Name of the dangerzone container
CONTAINER_NAME = "dangerzone.rocks/dangerzone"
STARTUP_TIME_SECONDS = 5

def __init__(self, enable_timeouts: bool) -> None:
self.enable_timeouts = 1 if enable_timeouts else 0
super().__init__()

@staticmethod
def get_runtime_name() -> str:
Expand Down Expand Up @@ -225,8 +220,6 @@ def pixels_to_pdf(
f"OCR={0 if ocr_lang is None else 1}",
"-e",
f"OCR_LANGUAGE={ocr_lang}",
"-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]

pixels_to_pdf_proc = self.exec_container(command, extra_args)
Expand Down Expand Up @@ -254,14 +247,10 @@ def start_doc_to_pixels_proc(self) -> subprocess.Popen:
"-m",
"dangerzone.conversion.doc_to_pixels",
]
extra_args = [
"-e",
f"ENABLE_TIMEOUTS={self.enable_timeouts}",
]
return self.exec_container(command, extra_args)
return self.exec_container(command)

def get_max_parallel_conversions(self) -> int:
# FIXME hardcoded 1 until timeouts are more limited and better handled
# FIXME hardcoded 1 until lenght conversions are better handled
# https://github.com/freedomofpress/dangerzone/issues/257
return 1

Expand Down
2 changes: 0 additions & 2 deletions dangerzone/isolation_provider/qubes.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
class Qubes(IsolationProvider):
"""Uses a disposable qube for performing the conversion"""

STARTUP_TIME_SECONDS = 5 * 60 # 5 minutes

def __init__(self) -> None:
self.proc: Optional[subprocess.Popen] = None
super().__init__()
Expand Down
2 changes: 1 addition & 1 deletion tests/isolation_provider/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

@pytest.fixture
def provider() -> Container:
return Container(enable_timeouts=False)
return Container()


class TestContainer(IsolationProviderTest):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_ocr_ommisions() -> None:
installed_langs -= {"osd", "equ"}

# Grab the languages that Dangerzone offers to the user through the GUI/CLI.
offered_langs = set(DangerzoneCore(Container(True)).ocr_languages.values())
offered_langs = set(DangerzoneCore(Container()).ocr_languages.values())

# Ensure that both the installed languages and the ones we offer to the user are the
# same.
Expand Down

0 comments on commit 773633b

Please sign in to comment.