From 492a891323704c7315cb0f3d146f03c151b8314a Mon Sep 17 00:00:00 2001
From: Rico Huijbers <rix0rrr@gmail.com>
Date: Mon, 14 Jun 2021 18:37:19 +0200
Subject: [PATCH] chore: pickle YAML files when we're in production (#489)

Pickling is ~400x faster than loading the YAML file directly,
so this saves a bunch of seconds once on every restart.

Could be even better if we pre-process the files once before
deploy, but this is a nice start.

This also fixes some otherwise potentially broken unit tests...
---
 .gitignore          |  2 +
 tests/test_utils.py | 18 +++++++++
 utils.py            | 90 ++++++++++++++++++++++++++++++++++++++++-----
 website/querylog.py |  7 ++++
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0fe92a9e826..f4f02298bbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -149,3 +149,5 @@ node_modules
 
 # Ignore things that start with underscores
 _*
+# Ignore pickled versions of YAML files
+*.yaml.pickle
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6441adca547..6afaf598ed1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,7 @@
 import unittest
 import utils
 import bcrypt
+import time
 
 class TestUtils(unittest.TestCase):
   def test_slashjoin(self):
@@ -15,3 +16,20 @@ def test_extract_rounds(self):
   def test_extract_default_rounds(self):
     salt = bcrypt.gensalt ().decode ('utf-8')
     self.assertEqual(12, utils.extract_bcrypt_rounds(salt))
+
+  def test_load_yaml_speed(self):
+    n = 50
+    file = 'coursedata/adventures/hu.yaml'
+
+    start = time.time()
+    for _ in range(n):
+      original_data = utils.load_yaml_uncached(file)
+    original_seconds = time.time() - start
+
+    start = time.time()
+    for _ in range(n):
+      cached_data = utils.load_yaml_pickled(file)
+    cached_seconds = time.time() - start
+
+    self.assertEqual(original_data, cached_data)
+    print(f'YAML loading takes {original_seconds / n} seconds, unpickling takes {cached_seconds / n} ({original_seconds / cached_seconds}x faster)')
\ No newline at end of file
diff --git a/utils.py b/utils.py
index a456478b741..abb9f3497c5 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,8 @@
+import contextlib
 import datetime
 import time
 from config import config
+import pickle
 import boto3
 import functools
 import os
@@ -9,6 +11,9 @@
 from website import querylog
 
 
+IS_WINDOWS = os.name == 'nt'
+
+
 class Timer:
   """A quick and dirty timer."""
   def __init__(self, name):
@@ -83,19 +88,61 @@ def set_debug_mode(debug_mode):
 def load_yaml(filename):
     """Load the given YAML file.
 
-    The file load will be cached in production, but reloaded everytime in development mode.
+    The file load will be cached in production, but reloaded everytime in
+    development mode for much iterating. Because YAML loading is still
+    somewhat slow, in production we'll have two levels of caching:
+
+    - In-memory cache: each of the N processes on the box will only need to
+      load the YAML file once (per restart).
 
-    Whether we are running in production or not will be determined
-    by the Flask config (FLASK_ENV).
+    - On-disk pickle cache: "pickle" is a more efficient Python serialization
+      format, and loads 400x quicker than YAML. We will prefer loading a pickle
+      file to loading the source YAML file if possible. Hopefully only 1/N
+      processes on the box will have to do the full load per deploy.
+
+    We should be generating the pickled files at build time, but Heroku doesn't
+    make it easy to have a build/deploy time... so for now let's just make sure
+    we only do it once per box per deploy.
     """
-    # Bypass the cache in DEBUG mode for mucho iterating
-    if not is_debug_mode() and filename in YAML_CACHE:
+    if is_debug_mode():
+        return load_yaml_uncached(filename)
+
+    # Production mode, check our two-level cache
+    if filename not in YAML_CACHE:
+        data = load_yaml_pickled(filename)
+        YAML_CACHE[filename] = data
+        return data
+    else:
         return YAML_CACHE[filename]
+
+
+def load_yaml_pickled(filename):
+    # Let's not even attempt the pickling on Windows, because we have
+    # no pattern to atomatically write the pickled result file.
+    if IS_WINDOWS:
+        return load_yaml_uncached(filename)
+
+    pickle_file = f'{filename}.pickle'
+    if not os.path.exists(pickle_file):
+        data = load_yaml_uncached(filename)
+
+        # Write a pickle file, first write to a tempfile then rename
+        # into place because multiple processes might try to do this in parallel,
+        # plus we only want `path.exists(pickle_file)` to return True once the
+        # file is actually complete and readable.
+        with atomic_write_file(pickle_file) as f:
+            pickle.dump(data, f)
+
+        return data
+    else:
+        with open(pickle_file, 'rb') as f:
+            return pickle.load(f)
+
+
+def load_yaml_uncached(filename):
     try:
-        with open (filename, 'r', encoding='utf-8') as f:
-            data = yaml.safe_load(f)
-            YAML_CACHE[filename] = data
-            return data
+        with open(filename, 'r', encoding='utf-8') as f:
+            return yaml.safe_load(f)
     except IOError:
         return {}
 
@@ -325,3 +372,28 @@ def version():
 
 def valid_email(s):
     return bool (re.match ('^(([a-zA-Z0-9_+\.\-]+)@([\da-zA-Z\.\-]+)\.([a-zA-Z\.]{2,6})\s*)$', s))
+
+
+@contextlib.contextmanager
+def atomic_write_file(filename, mode='wb'):
+    """Write to a filename atomically.
+
+    First write to a unique tempfile, then rename the tempfile into
+    place. Use as a context manager:
+
+        with atomic_write_file('file.txt') as f:
+            f.write('hello')
+
+    THIS WON'T WORK ON WINDOWS -- atomic file renames don't overwrite
+    on Windows. We could potentially do something else to make it work
+    (just swallow the exception, someone else already wrote the file?)
+    but for now we just don't support it.
+    """
+    if IS_WINDOWS:
+        raise RuntimeError('Cannot use atomic_write_file() on Windows!')
+
+    tmp_file = f'{filename}.{os.getpid()}'
+    with open(tmp_file, mode) as f:
+        yield f
+
+    os.rename(tmp_file, filename)
\ No newline at end of file
diff --git a/website/querylog.py b/website/querylog.py
index bff0c66d4bb..5f8d04ae35f 100644
--- a/website/querylog.py
+++ b/website/querylog.py
@@ -97,6 +97,13 @@ class NullRecord(LogRecord):
     def __init__(self, **kwargs): pass
     def finish(self): pass
     def set(self, **kwargs): pass
+    def _remember_timer(self, _): pass
+    def _forget_timer(self, _): pass
+    def _terminate_running_timers(self): pass
+    def inc_timer(self, _, _2): pass
+
+    def record_exception(self, exc):
+        self.set(fault=1, error_message=str(exc))
 
 
 THREAD_LOCAL = threading.local()