
chore: pickle YAML files when we're in production (hedyorg#489)
Pickling is ~400x faster than loading the YAML file directly,
so this saves a bunch of seconds once on every restart.

Could be even better if we pre-processed the files once before
deploy (see the sketch below), but this is a nice start.

This also fixes some otherwise potentially broken unit tests...
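For illustration, that pre-processing step might look something like the
sketch below. This is hypothetical and not part of the commit; it assumes the
coursedata/ layout used in the tests and the load_yaml_uncached and
atomic_write_file helpers added to utils.py in this diff.

import glob
import pickle

import utils

# Pre-pickle every YAML file up front so that no web process ever has to
# pay the YAML-parsing cost at runtime.
for yaml_file in glob.glob('coursedata/**/*.yaml', recursive=True):
    data = utils.load_yaml_uncached(yaml_file)
    # atomic_write_file opens the tempfile in binary mode by default,
    # which is what pickle.dump() needs.
    with utils.atomic_write_file(f'{yaml_file}.pickle') as f:
        pickle.dump(data, f)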
rix0rrr authored Jun 14, 2021
1 parent e3fc0eb commit 492a891
Showing 4 changed files with 108 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -149,3 +149,5 @@ node_modules

# Ignore things that start with underscores
_*
# Ignore pickled versions of YAML files
*.yaml.pickle
18 changes: 18 additions & 0 deletions tests/test_utils.py
@@ -1,6 +1,7 @@
import unittest
import utils
import bcrypt
import time

class TestUtils(unittest.TestCase):
    def test_slashjoin(self):
@@ -15,3 +16,20 @@ def test_extract_rounds(self):
    def test_extract_default_rounds(self):
        salt = bcrypt.gensalt().decode('utf-8')
        self.assertEqual(12, utils.extract_bcrypt_rounds(salt))

    def test_load_yaml_speed(self):
        n = 50
        file = 'coursedata/adventures/hu.yaml'

        start = time.time()
        for _ in range(n):
            original_data = utils.load_yaml_uncached(file)
        original_seconds = time.time() - start

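        # Note: if no .pickle file exists yet, the first iteration below also
        # pays the YAML-parse + pickle-write cost, so the measured speedup
        # is conservative.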
        start = time.time()
        for _ in range(n):
            cached_data = utils.load_yaml_pickled(file)
        cached_seconds = time.time() - start

        self.assertEqual(original_data, cached_data)
        print(f'YAML loading takes {original_seconds / n} seconds, unpickling takes {cached_seconds / n} ({original_seconds / cached_seconds}x faster)')
90 changes: 81 additions & 9 deletions utils.py
@@ -1,6 +1,8 @@
import contextlib
import datetime
import time
from config import config
import pickle
import boto3
import functools
import os
@@ -9,6 +11,9 @@
from website import querylog


IS_WINDOWS = os.name == 'nt'


class Timer:
    """A quick and dirty timer."""
    def __init__(self, name):
@@ -83,19 +88,61 @@ def set_debug_mode(debug_mode):
def load_yaml(filename):
    """Load the given YAML file.

-    The file load will be cached in production, but reloaded everytime in development mode.
    The file load will be cached in production, but reloaded every time in
    development mode for mucho iterating. Whether we are running in production
    or not is determined by the Flask config (FLASK_ENV). Because YAML loading
    is still somewhat slow, in production we'll have two levels of caching:

    - In-memory cache: each of the N processes on the box will only need to
      load the YAML file once (per restart).

    - On-disk pickle cache: "pickle" is a more efficient Python serialization
      format, and loads 400x quicker than YAML. We will prefer loading a
      pickle file to loading the source YAML file if possible. Hopefully only
      1/N processes on the box will have to do the full load per deploy.

    We should be generating the pickled files at build time, but Heroku doesn't
    make it easy to run anything at build/deploy time... so for now let's just
    make sure we only do it once per box per deploy.
    """
-    # Bypass the cache in DEBUG mode for mucho iterating
-    if not is_debug_mode() and filename in YAML_CACHE:
    if is_debug_mode():
        return load_yaml_uncached(filename)

    # Production mode, check our two-level cache
    if filename not in YAML_CACHE:
        data = load_yaml_pickled(filename)
        YAML_CACHE[filename] = data
        return data
    else:
        return YAML_CACHE[filename]
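A quick illustration of the intended two-level behavior (hypothetical
interactive session; the file path is only an example):

# First call in a fresh production process hits disk (pickle or YAML)...
a = load_yaml('coursedata/adventures/hu.yaml')
# ...while subsequent calls in the same process come from the in-memory cache.
b = load_yaml('coursedata/adventures/hu.yaml')
assert a == b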


def load_yaml_pickled(filename):
    # Let's not even attempt the pickling on Windows, because we have
    # no pattern to atomically write the pickled result file.
    if IS_WINDOWS:
        return load_yaml_uncached(filename)

    pickle_file = f'{filename}.pickle'
    if not os.path.exists(pickle_file):
        data = load_yaml_uncached(filename)

        # Write a pickle file: first write to a tempfile, then rename it
        # into place, because multiple processes might try to do this in
        # parallel, plus we only want `os.path.exists(pickle_file)` to
        # return True once the file is actually complete and readable.
        with atomic_write_file(pickle_file) as f:
            pickle.dump(data, f)

        return data
    else:
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)


def load_yaml_uncached(filename):
    try:
-        with open (filename, 'r', encoding='utf-8') as f:
-            data = yaml.safe_load(f)
-            YAML_CACHE[filename] = data
-            return data
        with open(filename, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except IOError:
        return {}

@@ -325,3 +372,28 @@ def version():

def valid_email(s):
    return bool(re.match(r'^(([a-zA-Z0-9_+\.\-]+)@([\da-zA-Z\.\-]+)\.([a-zA-Z\.]{2,6})\s*)$', s))


@contextlib.contextmanager
def atomic_write_file(filename, mode='wb'):
    """Write to a filename atomically.

    First write to a unique tempfile, then rename the tempfile into
    place. Use as a context manager:

        with atomic_write_file('file.txt', mode='w') as f:
            f.write('hello')

    THIS WON'T WORK ON WINDOWS -- atomic file renames don't overwrite
    on Windows. We could potentially do something else to make it work
    (just swallow the exception, someone else already wrote the file?)
    but for now we just don't support it.
    """
    if IS_WINDOWS:
        raise RuntimeError('Cannot use atomic_write_file() on Windows!')

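    # os.getpid() makes the tempfile name unique per process, so parallel
    # writers never clobber each other's partially-written files.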
    tmp_file = f'{filename}.{os.getpid()}'
    with open(tmp_file, mode) as f:
        yield f

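    # If the caller's code raises inside the `with`, we never reach the
    # rename below, so the destination is never left half-written (though
    # the stray tempfile is left behind).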
    os.rename(tmp_file, filename)
7 changes: 7 additions & 0 deletions website/querylog.py
@@ -97,6 +97,13 @@ class NullRecord(LogRecord):
    def __init__(self, **kwargs): pass
    def finish(self): pass
    def set(self, **kwargs): pass
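    # Presumably these no-op timer hooks let timer-recording code (e.g. the
    # unit tests mentioned in the commit message) run against a NullRecord
    # without errors.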
    def _remember_timer(self, _): pass
    def _forget_timer(self, _): pass
    def _terminate_running_timers(self): pass
    def inc_timer(self, _, _2): pass

    def record_exception(self, exc):
        self.set(fault=1, error_message=str(exc))


THREAD_LOCAL = threading.local()