From 492a891323704c7315cb0f3d146f03c151b8314a Mon Sep 17 00:00:00 2001 From: Rico Huijbers Date: Mon, 14 Jun 2021 18:37:19 +0200 Subject: [PATCH] chore: pickle YAML files when we're in production (#489) Pickling is ~400x faster than loading the YAML file directly, so this saves a bunch of seconds once on every restart. Could be even better if we pre-process the files once before deploy, but this is a nice start. This also fixes some otherwise potentially broken unit tests... --- .gitignore | 2 + tests/test_utils.py | 18 +++++++++ utils.py | 90 ++++++++++++++++++++++++++++++++++++++++----- website/querylog.py | 7 ++++ 4 files changed, 108 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 0fe92a9e826..f4f02298bbd 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,5 @@ node_modules # Ignore things that start with underscores _* +# Ignore pickled versions of YAML files +*.yaml.pickle diff --git a/tests/test_utils.py b/tests/test_utils.py index 6441adca547..6afaf598ed1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ import unittest import utils import bcrypt +import time class TestUtils(unittest.TestCase): def test_slashjoin(self): @@ -15,3 +16,20 @@ def test_extract_rounds(self): def test_extract_default_rounds(self): salt = bcrypt.gensalt ().decode ('utf-8') self.assertEqual(12, utils.extract_bcrypt_rounds(salt)) + + def test_load_yaml_speed(self): + n = 50 + file = 'coursedata/adventures/hu.yaml' + + start = time.time() + for _ in range(n): + original_data = utils.load_yaml_uncached(file) + original_seconds = time.time() - start + + start = time.time() + for _ in range(n): + cached_data = utils.load_yaml_pickled(file) + cached_seconds = time.time() - start + + self.assertEqual(original_data, cached_data) + print(f'YAML loading takes {original_seconds / n} seconds, unpickling takes {cached_seconds / n} ({original_seconds / cached_seconds}x faster)') \ No newline at end of file diff --git a/utils.py b/utils.py index a456478b741..abb9f3497c5 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,8 @@ +import contextlib import datetime import time from config import config +import pickle import boto3 import functools import os @@ -9,6 +11,9 @@ from website import querylog +IS_WINDOWS = os.name == 'nt' + + class Timer: """A quick and dirty timer.""" def __init__(self, name): @@ -83,19 +88,61 @@ def set_debug_mode(debug_mode): def load_yaml(filename): """Load the given YAML file. - The file load will be cached in production, but reloaded everytime in development mode. + The file load will be cached in production, but reloaded everytime in + development mode for much iterating. Because YAML loading is still + somewhat slow, in production we'll have two levels of caching: + + - In-memory cache: each of the N processes on the box will only need to + load the YAML file once (per restart). - Whether we are running in production or not will be determined - by the Flask config (FLASK_ENV). + - On-disk pickle cache: "pickle" is a more efficient Python serialization + format, and loads 400x quicker than YAML. We will prefer loading a pickle + file to loading the source YAML file if possible. Hopefully only 1/N + processes on the box will have to do the full load per deploy. + + We should be generating the pickled files at build time, but Heroku doesn't + make it easy to have a build/deploy time... so for now let's just make sure + we only do it once per box per deploy. """ - # Bypass the cache in DEBUG mode for mucho iterating - if not is_debug_mode() and filename in YAML_CACHE: + if is_debug_mode(): + return load_yaml_uncached(filename) + + # Production mode, check our two-level cache + if filename not in YAML_CACHE: + data = load_yaml_pickled(filename) + YAML_CACHE[filename] = data + return data + else: return YAML_CACHE[filename] + + +def load_yaml_pickled(filename): + # Let's not even attempt the pickling on Windows, because we have + # no pattern to atomatically write the pickled result file. + if IS_WINDOWS: + return load_yaml_uncached(filename) + + pickle_file = f'{filename}.pickle' + if not os.path.exists(pickle_file): + data = load_yaml_uncached(filename) + + # Write a pickle file, first write to a tempfile then rename + # into place because multiple processes might try to do this in parallel, + # plus we only want `path.exists(pickle_file)` to return True once the + # file is actually complete and readable. + with atomic_write_file(pickle_file) as f: + pickle.dump(data, f) + + return data + else: + with open(pickle_file, 'rb') as f: + return pickle.load(f) + + +def load_yaml_uncached(filename): try: - with open (filename, 'r', encoding='utf-8') as f: - data = yaml.safe_load(f) - YAML_CACHE[filename] = data - return data + with open(filename, 'r', encoding='utf-8') as f: + return yaml.safe_load(f) except IOError: return {} @@ -325,3 +372,28 @@ def version(): def valid_email(s): return bool (re.match ('^(([a-zA-Z0-9_+\.\-]+)@([\da-zA-Z\.\-]+)\.([a-zA-Z\.]{2,6})\s*)$', s)) + + +@contextlib.contextmanager +def atomic_write_file(filename, mode='wb'): + """Write to a filename atomically. + + First write to a unique tempfile, then rename the tempfile into + place. Use as a context manager: + + with atomic_write_file('file.txt') as f: + f.write('hello') + + THIS WON'T WORK ON WINDOWS -- atomic file renames don't overwrite + on Windows. We could potentially do something else to make it work + (just swallow the exception, someone else already wrote the file?) + but for now we just don't support it. + """ + if IS_WINDOWS: + raise RuntimeError('Cannot use atomic_write_file() on Windows!') + + tmp_file = f'{filename}.{os.getpid()}' + with open(tmp_file, mode) as f: + yield f + + os.rename(tmp_file, filename) \ No newline at end of file diff --git a/website/querylog.py b/website/querylog.py index bff0c66d4bb..5f8d04ae35f 100644 --- a/website/querylog.py +++ b/website/querylog.py @@ -97,6 +97,13 @@ class NullRecord(LogRecord): def __init__(self, **kwargs): pass def finish(self): pass def set(self, **kwargs): pass + def _remember_timer(self, _): pass + def _forget_timer(self, _): pass + def _terminate_running_timers(self): pass + def inc_timer(self, _, _2): pass + + def record_exception(self, exc): + self.set(fault=1, error_message=str(exc)) THREAD_LOCAL = threading.local()