Initial benchmarks for caching. Closes hasura#3530
These aren't suitable for running in CI, for example, since some take far too
long (and an impossibly long time when running under criterion's normal
bootstrapping sampling regime).

We might try to improve this ourselves:
 haskell/criterion#218

An initial summary analysis will be in hasura#3530.
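For context, criterion's default regime takes many timed samples per benchmark and then bootstraps statistics over them, which is what makes the slow benchmarks here intractable. A partial mitigation (an editor's sketch, not part of this commit, and no substitute for the hard per-run limit the linked issue asks for) is to shrink the per-benchmark sampling budget via `defaultMainWith`:

import Criterion.Main (bench, defaultConfig, defaultMainWith, whnfIO)
import Criterion.Types (Config (..))

main :: IO ()
main =
  defaultMainWith
    -- cap sampling at roughly 5 seconds per benchmark (criterion can
    -- still overshoot if a single iteration is very slow):
    defaultConfig { timeLimit = 5 }
    [ bench "noop" $ whnfIO (pure ()) ]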
jberryman committed Dec 13, 2019
1 parent 9c3183b commit 7adc834
Showing 4 changed files with 324 additions and 7 deletions.
31 changes: 25 additions & 6 deletions server/graphql-engine.cabal
@@ -186,6 +186,10 @@ library
, Hasura.Prelude
, Hasura.App
, Hasura.Db
-- Exposed for benchmark:
, Hasura.Cache.Bounded
, Hasura.Cache.Unbounded
, Hasura.Cache
, Hasura.Logging
, Hasura.HTTP

@@ -205,12 +209,7 @@ library
, Hasura.RQL.DDL.Metadata
, Hasura.EncJSON

other-modules: Hasura.Cache
, Hasura.Cache
, Hasura.Cache.Bounded
, Hasura.Cache.Unbounded

, Hasura.Server.Auth.JWT
other-modules: Hasura.Server.Auth.JWT
, Hasura.Server.Middleware
, Hasura.Server.Cors
, Hasura.Server.CheckUpdates
@@ -361,3 +360,23 @@ test-suite graphql-engine-tests
, optparse-applicative
, pg-client
, time

-- Benchmarks related to caching (e.g. the plan cache).
--
-- NOTE: Some of these are very slow and can only be reasonably run with `cache -n 1` for now.
benchmark cache
import: common-all, common-exe
type: exitcode-stdio-1.0
main-is: Main.hs
hs-source-dirs: src-bench-cache
build-depends: base
, criterion
, mwc-random
, mwc-probability
, vector
, deepseq
, graphql-engine
, split
, async
, text
, bytestring
281 changes: 281 additions & 0 deletions server/src-bench-cache/Main.hs
@@ -0,0 +1,281 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Main where

import Prelude
import Criterion.Main
import Data.Word
import Data.Bits
import Data.IORef
import Data.List
import Data.List.Split (chunksOf)
import Data.Ord
import Data.Traversable
import qualified Data.Vector as V
import Control.Concurrent.Async (forConcurrently)
import Control.Concurrent (getNumCapabilities)
import Control.DeepSeq
import Control.Monad (foldM)
import GHC.Clock
import System.Random.MWC as Rand
import System.Random.MWC.Probability as Rand
import qualified Hasura.Cache.Bounded as B
import qualified Hasura.Cache.Unbounded as U
-- higher level interface to above, combined:
import qualified Hasura.Cache as Cache

-- Benchmarks for code backing the plan cache.

main :: IO ()
main = defaultMain [
-- simple lookup benchmarks at different capacities. Although reads are effectful
-- in Bounded cache, we don't expect this to cause drift in timings.
bgroup "lookup" [
readBenches 1
, readBenches 100
-- This is the maximum capacity for bounded at the moment. Make 1mil if
-- we increase this bound:
, readBenches 65535
]
-- simple insert benchmark. Try to avoid drift by initialising fresh
-- and measuring 1000 inserts at a time.
, env (randomInts 1000) $ \ ~rs->
bgroup "insert x1000" [
-- use perRunEnv so we can be sure we're not triggering cache
-- evictions in bounded due to long bootstrap batch runs
bench "unbounded" $
perRunEnv (U.initialise) $ \cache ->
V.mapM_ (\k -> U.insert k k cache) rs
, bench "bounded" $
perRunEnv (B.initialise 4000) $ \cache ->
V.mapM_ (\k -> B.insert k k cache) rs
-- an eviction on each insert, all LRU counters at zero. Simulates a scan.
, bench "bounded evicting scan" $
let preloaded = populate 5000 (B.initialise 5000) B.insertAllStripes
in perRunEnv (preloaded) $ \(cache, _) ->
V.mapM_ (\k -> B.insert k k cache) rs
]

---- lookup+insert loops on realistic data, with a tunable cost of a cache
---- miss.
--
-- No extra cost to a cache miss. This might be useful to remove noise or
-- enhance contention effects:
, realisticBenches "realistic requests x1000000, no miss cost" 0
-- Here we simulate generating a plan on a miss. The timing here was obtained
-- at commit b81d22f58 by measuring the average runtime difference in runGQ
-- when caching was enabled and disabled, for one particular query.
--
-- There are a lot of other valid numbers we could use here though, e.g. 1.5
-- ms was approximately the minimum cost; the avg was heavily skewed.
, realisticBenches "realistic requests x1000000, real plan gen cost" 200000 -- ~20ms
-- A number pulled out of the air, to give us a sense of what optimizing plan
-- generation might do for us in presence of caching:
, realisticBenches "realistic requests x1000000, optimized 1ms miss cost" 10000 -- ~1ms

, bgroup "misc" [
-- Is burnCycles valid as a tunable consistent workload?
bench "burnCycles x1" $ nfAppIO burnCycles 1
, bench "burnCycles x2" $ nfAppIO burnCycles 2
, bench "burnCycles x4" $ nfAppIO burnCycles 4
, bench "burnCycles x1000" $ nfAppIO burnCycles 1000
]
]

-- | Simulate a realistic lookup+workload+insert loop on zipf-distributed data,
-- with a tunable workload to simulate the cost of a cache miss.
--
-- NOTE: our hypothesis (that requests are power law distributed) might not be
-- correct, or might be incorrect for some users. Or it might be that many
-- users interact with hasura ONLY with parameterized queries with variables,
-- where all of these fit into a fairly small cache (but where occurrences of
-- these are zipf-distributed). (TODO It should be simple to adapt this to the latter
-- case (just test on zipf Word8 domain), but these benchmarks don't seem very
-- useful if we assume we effectively get only cache hits).
--
-- This might give us insight into:
-- - Are stripes actually helpful?
-- - Does contention cause issues (e.g. can we induce it w/ wrk=0?)?
-- Do we want a lockfree algorithm?
-- - Is it worthwhile to try to improve performance of plan generation (here
-- simulated by decreasing the 'wrk' parameter)?
-- - Could we benefit from a more efficient lookup/insert loop e.g. by hashing only once?
-- - What might be a good default cache size bound?
-- - Different caching algorithms/schemes:
-- - alternatives to LRU (although we don't intentionally simulate scans here)
-- - caching with random probability to chop long tail
-- - ...
realisticBenches :: String -> Int -> Benchmark
realisticBenches name wrk =
bgroup name [
-- 27K uniques, 97% in top 10%, 97% cache hits ideally
env (zipfianRandomInts 1000000 1.4) $ \ ~(payloads, _,_,_,_)-> -- EXPENSIVE!
bgroup "optimistic distribution" $
-- For oversubscribed case: can we see descheduled threads blocking global progress?
flip map [2,100] $ \threadsPerHEC ->
bgroup (show threadsPerHEC <>"xCPUs threads") [
bench "unbounded" $
perRunEnv (Cache.initialise $ Cache.mkCacheOptions Nothing) $ \cache ->
go threadsPerHEC cache payloads
, bench "bounded effectively unbounded" $
perRunEnv (Cache.initialise $ Cache.mkCacheOptions $ Just 40000) $ \cache ->
go threadsPerHEC cache payloads
, bench "bounded 10pct ideal capacity" $
perRunEnv (Cache.initialise $ Cache.mkCacheOptions $ Just 2700) $ \cache ->
go threadsPerHEC cache payloads
]
-- 660K uniques, 40% in top 10% , 30% in top 1%, 33% cache hits ideally
, env (zipfianRandomInts 1000000 1.01) $ \ ~(payloads, _,_,_,_)-> -- EXPENSIVE!
bgroup "realistic distribution" $
flip map [2,100] $ \threadsPerHEC ->
bgroup (show threadsPerHEC <>"xCPUs threads") [
bench "unbounded" $
perRunEnv (Cache.initialise $ Cache.mkCacheOptions Nothing) $ \cache ->
go threadsPerHEC cache payloads
, bench "bounded maxBound (10pct ideal capacity)" $
-- this is our largest possible cache size; it will necessarily evict
perRunEnv (Cache.initialise $ Cache.mkCacheOptions $ Just maxBound) $ \cache ->
go threadsPerHEC cache payloads
, bench "bounded 6000 (1pct ideal capacity)" $
perRunEnv (Cache.initialise $ Cache.mkCacheOptions $ Just 6000) $ \cache ->
go threadsPerHEC cache payloads
]
]
where
go :: Int -> Cache.Cache Int Int -> [Int] -> IO ()
go threadFactor cache payload = do
bef <- getMonotonicTimeNSec
-- So that `go 0 ...` will give us a single thread:
threads <- (+ 1) . (* threadFactor) <$> getNumCapabilities
-- each thread takes its own interleaved section of payload. Try to do
-- this work before forking.
let !localPayloads = force $
map (\tN -> map head $ chunksOf threads $ drop tN payload) [0..(threads-1)]
_hitsMisses <- forConcurrently localPayloads $ \payloadL -> do
foldM lookupInsertLoop (0,0) payloadL
aft <- getMonotonicTimeNSec
-- TODO we need to decide whether to rewrite these benchmarks or fix
-- criterion so it can support what I want here (to run a slow benchmark
-- perhaps one time, with an actual time limit).
-- We should also look into just generating a report by hand that takes
-- into account per-thread misses without actually simulating them with
-- burnCycles.
putStrLn $ "TIMING: " <>(show $ fromIntegral (aft-bef) / (1000*1000 :: Double)) <> "ms"
-- putStrLn $ "HITS/MISSES: "<> show _hitsMisses -- DEBUGGING/FYI
return ()
where
lookupInsertLoop :: (Int, Int) -> Int -> IO (Int, Int)
lookupInsertLoop (!h, !m) p = do
Cache.lookup p cache >>= \case
-- happy path: item was in the cache:
Just !_ -> return (h+1, m)
-- sad path: Do some work to simulate cost of a cache miss before caching:
Nothing -> do
-- add some jitter to workload:
let jRange = wrk `div` 4 -- tunable
-- assumes `p` is random:
wrkJittered
| wrk == 0 = 0
| otherwise = wrk + ((p `mod` jRange) - (jRange `div` 2))
!_ <- burnCycles wrkJittered
Cache.insert p p cache
return (h, m+1)



-- | Do some work that scales linearly in N and hopefully won't be optimized
-- away. We also make sure to allocate, so that the runtime can deschedule
-- threads running this.
--
-- This is tuned to take 100ns on my machine.
--
-- NOTE: it would be nice (maybe) if we could just tell criterion that we want
-- to fake some extra time added to a particular benchmark run.
burnCycles :: Int -> IO Int
{-# NOINLINE burnCycles #-}
burnCycles = go 0XBEEF where
go !x !n
| n <= 0 = return x
| otherwise = do
uselessRef <- newIORef x
let pureWork = 73 -- arbitrary, for fine-tuning
!x' = foldl' (\acc b-> (acc `xor` b) * 1099511628211) x [1..pureWork]
x'' <- readIORef uselessRef
go (x' `xor` x'') (n-1)
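-- (Editor's sketch, not part of this commit.) The 100ns figure above is
-- machine-specific; a hypothetical helper along these lines re-measures the
-- per-unit cost of 'burnCycles', reusing the monotonic clock imported above:
calibrateBurnNs :: Int -> IO Double
calibrateBurnNs n = do
  bef <- getMonotonicTimeNSec
  !_ <- burnCycles n
  aft <- getMonotonicTimeNSec
  -- nanoseconds of wall clock per unit of simulated work:
  return (fromIntegral (aft - bef) / fromIntegral n)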



readBenches :: Int -> Benchmark
readBenches n =
bgroup ("size "<>show n) [
env (populate n U.initialise U.insertAllStripes) $ \ ~(cache, k)->
bgroup "unbounded" [
bench "hit" $
nfAppIO (\k' -> U.lookup k' cache) k
, bench "miss" $
nfAppIO (\k' -> U.lookup k' cache) 0xDEAD
]
, env (populate n (B.initialise (fromIntegral $ n*2)) B.insertAllStripes) $ \ ~(cache, k)->
bgroup "unbounded" [
bench "hit" $
nfAppIO (\k' -> B.lookup k' cache) k
, bench "miss" $
nfAppIO (\k' -> B.lookup k' cache) 0xDEAD
]
]


-- | Return a randomly-populated cache, along with an item somewhere in the middle.
-- We take care to use random keys since Hashable is untrustworthy.
populate :: Int -> IO cache -> (Int -> Int -> cache -> IO b) -> IO (cache, Int)
populate n _initialise _insertAllStripes = do
cache <- _initialise
rs <- randomInts n
mapM_ (\k -> _insertAllStripes k k cache) rs
let medianish = V.minimumBy (comparing abs) rs
return (cache, medianish)


randomInts :: Int -> IO (V.Vector Int)
randomInts n =
withSystemRandom . asGenST $ \gen -> uniformVector gen n


-- | Return a zipf-mandelbrot distributed list of 'n' Ints (the Ints themselves
-- will be uniformly random, see 'randomInts'). The second parameter controls
-- the skew. Alongside the list we also return:
--  - number of unique values in the list (i.e. max cache residency)
--  - number of cache hits assuming an unbounded cache, no races or striping
--  - pct of samples falling into the most frequent 10% bucket
--  - pct of samples falling into the most frequent 1% bucket
--
-- These can be used to try to pick some reasonable skew parameter (I'm not
-- sure how to do that more scientifically).
--
-- This is slow, and as skew gets closer to 1 (e.g. 1.0001) this becomes very
-- slow, which is a shame because these seem most realistic.
zipfianRandomInts :: Int -> Double -> IO ([Int], Int, Int, Double, Double)
zipfianRandomInts n sk = do
gen <- Rand.createSystemRandom
payloadVals <- randomInts $ 100*1000
zipfIxs <- Rand.samples n (Rand.zipf sk) gen :: IO [Word32]
let groupings = reverse $ sort $ map length $ group $ sort zipfIxs
uniqs = length groupings
top buckets =
let inTop = sum $ take ((uniqs `div` buckets) + 1) groupings
in fromIntegral inTop / fromIntegral n :: Double
idealHits = sum $ map (subtract 1) groupings
payloads <- for zipfIxs $ \ix_w32 ->
case payloadVals V.!? fromIntegral ix_w32 of
-- we could generate a random val here, but this seems fine:
Nothing -> pure $ fromIntegral ix_w32
Just x -> pure x
return (payloads, uniqs, idealHits, top 10, top 100)


-- noops, orphans:
instance NFData (B.BoundedCache k v) where
rnf _ = ()
instance NFData (U.UnboundedCache k v) where
rnf _ = ()
instance NFData (Cache.Cache k v) where
rnf _ = ()
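As a usage note (an editor's sketch, not part of the commit): the extra values returned by `zipfianRandomInts` make it cheap to eyeball a candidate skew parameter before paying for a full benchmark run. Assuming the imports and definitions of Main.hs above, a hypothetical helper might look like:

previewSkew :: Double -> IO ()
previewSkew sk = do
  (_, uniqs, idealHits, top10, top1) <- zipfianRandomInts 1000000 sk
  putStrLn $ show uniqs <> " uniques, " <> show idealHits <> " ideal hits, "
          <> "top-10% share " <> show top10 <> ", top-1% share " <> show top1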
9 changes: 9 additions & 0 deletions server/src-lib/Hasura/Cache/Bounded.hs
@@ -11,6 +11,7 @@ module Hasura.Cache.Bounded
, initialise
, clear
, insert
, insertAllStripes
, lookup
, getEntries
) where
@@ -169,12 +170,20 @@ getLocal (BoundedCache handles) = do

return $ handles V.! j

-- | Insert into our thread's local cache stripe.
insert
:: (Hashable k, Ord k) => k -> v -> BoundedCache k v -> IO ()
insert k v striped = do
localHandle <- getLocal striped
insertLocal localHandle k v

-- | Insert into all stripes (non-atomically).
insertAllStripes
:: (Hashable k, Ord k) => k -> v -> BoundedCache k v -> IO ()
insertAllStripes k v (BoundedCache handles) = do
forM_ handles $ \localHandle ->
insertLocal localHandle k v

lookup :: (Hashable k, Ord k) => k -> BoundedCache k v -> IO (Maybe v)
lookup k striped = do
localHandle <- getLocal striped
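Why the benchmarks preload with `insertAllStripes` rather than `insert`: as the code above shows, `lookup` consults only the calling thread's stripe, so seeding every stripe guarantees that a later lookup hits regardless of which capability it runs on. A minimal sketch (an editor's illustration using only the exports shown above; the capacity of 1000 is arbitrary):

import qualified Hasura.Cache.Bounded as B

warmedCache :: IO (B.BoundedCache Int Int)
warmedCache = do
  cache <- B.initialise 1000
  -- seed every stripe so any thread's lookup of these keys will hit:
  mapM_ (\k -> B.insertAllStripes k k cache) [1 .. 500 :: Int]
  return cache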
10 changes: 9 additions & 1 deletion server/src-lib/Hasura/Cache/Unbounded.hs
@@ -6,6 +6,7 @@ module Hasura.Cache.Unbounded
, initialise
, clear
, insert
, insertAllStripes
, lookup
, getEntries
) where
@@ -75,13 +76,20 @@ getLocal (UnboundedCache handles) = do

return $ handles V.! j

-- | Striped version of 'cached'.
-- | Insert into our thread's local cache stripe.
insert
:: (Hashable k, Eq k) => k -> v -> UnboundedCache k v -> IO ()
insert k v striped = do
localHandle <- getLocal striped
insertLocal localHandle k v

-- | Insert into all stripes (non-atomically).
insertAllStripes
:: (Hashable k, Eq k) => k -> v -> UnboundedCache k v -> IO ()
insertAllStripes k v (UnboundedCache handles) = do
forM_ handles $ \localHandle ->
insertLocal localHandle k v

lookup :: (Hashable k, Eq k) => k -> UnboundedCache k v -> IO (Maybe v)
lookup k striped = do
localHandle <- getLocal striped
