Skip to content

Commit

Permalink
Improve deserialisation performance
Browse files Browse the repository at this point in the history
Use the lower-level array-construction primitives to avoid intermediate
allocations and perform much better at deserialisation.

On Cabal (which uses tar for the hackage index), we observed:
* Deserialisation of IntTries go from 1.5s to 200ms, with 10GB of allocations going down to roughly 600MB.
* StringTable deserialization go from 700ms to 50ms, with 4GB of allocations going down to 80MB.

Unfortunately, the newGenArray primitive was only introduced in array 0.5.6.
Since we can't update the bound to force such a recent version of array,
we implement the beToLe function using unboxed array primitives that
have been long available, rather than newGenArray.
  • Loading branch information
alt-romes authored and Bodigrim committed Jun 14, 2024
1 parent 0ce2ead commit af3d0c7
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 54 deletions.
21 changes: 9 additions & 12 deletions Codec/Archive/Tar/Index/IntTrie.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{-# LANGUAGE CPP, BangPatterns, PatternGuards #-}
{-# LANGUAGE DeriveDataTypeable, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
{-# OPTIONS_HADDOCK hide #-}

module Codec.Archive.Tar.Index.IntTrie (
Expand Down Expand Up @@ -55,6 +57,9 @@ import Data.IntMap.Strict (IntMap)

import Data.List hiding (lookup, insert)
import Data.Function (on)
import GHC.IO

import Codec.Archive.Tar.Index.Utils

-- | A compact mapping from sequences of nats to nats.
--
Expand Down Expand Up @@ -338,19 +343,11 @@ deserialise bs
, let lenArr = readWord32BE bs 0
lenTotal = 4 + 4 * fromIntegral lenArr
, BS.length bs >= 4 + 4 * fromIntegral lenArr
, let !arr = A.array (0, lenArr-1)
[ (i, readWord32BE bs off)
| (i, off) <- zip [0..lenArr-1] [4,8 .. lenTotal - 4] ]
!bs' = BS.drop lenTotal bs
= Just (IntTrie arr, bs')
, let !bs_without_len = BS.unsafeDrop 4 bs
!bs_remaining = BS.unsafeDrop lenTotal bs
!arr = unsafePerformIO $ beToLe lenArr bs_without_len
= Just (IntTrie arr, bs_remaining)

| otherwise
= Nothing

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))
17 changes: 5 additions & 12 deletions Codec/Archive/Tar/Index/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import Codec.Archive.Tar.Read as Tar
import qualified Codec.Archive.Tar.Index.StringTable as StringTable
import Codec.Archive.Tar.Index.StringTable (StringTable, StringTableBuilder)
import qualified Codec.Archive.Tar.Index.IntTrie as IntTrie
import Codec.Archive.Tar.Index.Utils (readWord32BE)
import Codec.Archive.Tar.Index.IntTrie (IntTrie, IntTrieBuilder)
import Codec.Archive.Tar.PackAscii

Expand Down Expand Up @@ -496,28 +497,20 @@ deserialise bs

| let ver = readWord32BE bs 0
, ver == 1
= do let !finalOffset = readWord32BE bs 4
(stringTable, bs') <- StringTable.deserialiseV1 (BS.drop 8 bs)
= do let !finalOffset = readWord32BE bs 1
(stringTable, bs') <- StringTable.deserialiseV1 (BS.unsafeDrop 8 bs)
(intTrie, bs'') <- IntTrie.deserialise bs'
return (TarIndex stringTable intTrie finalOffset, bs'')

| let ver = readWord32BE bs 0
, ver == 2
= do let !finalOffset = readWord32BE bs 4
(stringTable, bs') <- StringTable.deserialiseV2 (BS.drop 8 bs)
= do let !finalOffset = readWord32BE bs 1
(stringTable, bs') <- StringTable.deserialiseV2 (BS.unsafeDrop 8 bs)
(intTrie, bs'') <- IntTrie.deserialise bs'
return (TarIndex stringTable intTrie finalOffset, bs'')

| otherwise = Nothing

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))

toStrict :: LBS.ByteString -> BS.ByteString
toStrict = LBS.toStrict

Expand Down
56 changes: 26 additions & 30 deletions Codec/Archive/Tar/Index/StringTable.hs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import Data.Monoid ((<>))
import Control.Exception (assert)

import qualified Data.Array.Unboxed as A
import qualified Data.Array.Base as A
import Data.Array.Unboxed ((!))
import qualified Data.Map.Strict as Map
import Data.Map.Strict (Map)
Expand All @@ -45,6 +46,10 @@ import qualified Data.ByteString.Unsafe as BS
import qualified Data.ByteString.Lazy as LBS
import Data.ByteString.Builder as BS
import Data.ByteString.Builder.Extra as BS (byteStringCopy)
import GHC.IO (unsafePerformIO)

import Unsafe.Coerce (unsafeCoerce)
import Codec.Archive.Tar.Index.Utils

-- | An efficient mapping from strings to a dense set of integers.
--
Expand Down Expand Up @@ -169,10 +174,10 @@ deserialiseV1 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
deserialiseV1 bs
| BS.length bs >= 8
, let lenStrs = fromIntegral (readWord32BE bs 0)
lenArr = fromIntegral (readWord32BE bs 4)
lenArr = fromIntegral (readWord32BE bs 1)
lenTotal= 8 + lenStrs + 4 * lenArr
, BS.length bs >= lenTotal
, let strs = BS.take lenStrs (BS.drop 8 bs)
, let strs = BS.unsafeTake lenStrs (BS.unsafeDrop 8 bs)
arr = A.array (0, fromIntegral lenArr - 1)
[ (i, readWord32BE bs off)
| (i, off) <- zip [0 .. fromIntegral lenArr - 1]
Expand All @@ -194,41 +199,32 @@ deserialiseV2 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
deserialiseV2 bs
| BS.length bs >= 8
, let lenStrs = fromIntegral (readWord32BE bs 0)
lenArr = fromIntegral (readWord32BE bs 4)
lenArr = fromIntegral (readWord32BE bs 1)
lenTotal= 8 -- the two length prefixes
+ lenStrs
+ 4 * lenArr
+(4 * (lenArr - 1)) * 2 -- offsets array is 1 longer
, BS.length bs >= lenTotal
, let strs = BS.take lenStrs (BS.drop 8 bs)
offs = A.listArray (0, fromIntegral lenArr - 1)
[ readWord32BE bs off
| off <- offsets offsOff ]
-- the second two arrays are 1 shorter
ids = A.listArray (0, fromIntegral lenArr - 2)
[ readInt32BE bs off
| off <- offsets idsOff ]
ixs = A.listArray (0, fromIntegral lenArr - 2)
[ readInt32BE bs off
| off <- offsets ixsOff ]
offsOff = 8 + lenStrs
idsOff = offsOff + 4 * lenArr
ixsOff = idsOff + 4 * (lenArr-1)
offsets from = [from,from+4 .. from + 4 * (lenArr - 1)]
, let strs = BS.unsafeTake lenStrs (BS.unsafeDrop 8 bs)
offs_bs = BS.unsafeDrop (8 + lenStrs) bs
ids_bs = BS.unsafeDrop (lenArr * 4) offs_bs
ixs_bs = BS.unsafeDrop ((lenArr - 1) * 4) ids_bs

castArray :: A.UArray i Word32 -> A.UArray i Int32
castArray (A.UArray a b c d) = (A.UArray a b c d)

-- Bangs are crucial for this to work in spite of unsafePerformIO!
(offs, ids, ixs) = unsafePerformIO $ do
!r1 <- beToLe (fromIntegral lenArr) offs_bs
!r2 <- castArray <$> beToLe (fromIntegral lenArr - 1) ids_bs
!r3 <- castArray <$> beToLe (fromIntegral lenArr - 1) ixs_bs
return (r1, r2, r3)


!stringTable = StringTable strs offs ids ixs
!bs' = BS.drop lenTotal bs
= Just (stringTable, bs')
!bs_left = BS.drop lenTotal bs
= Just (stringTable, bs_left)

| otherwise
= Nothing

readInt32BE :: BS.ByteString -> Int -> Int32
readInt32BE bs i = fromIntegral (readWord32BE bs i)

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))
84 changes: 84 additions & 0 deletions Codec/Archive/Tar/Index/Utils.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{-# LANGUAGE MagicHash, UnboxedTuples, BangPatterns, CPP #-}
module Codec.Archive.Tar.Index.Utils where

import Data.ByteString as BS
import Control.Exception (assert)

import Data.ByteString.Internal (ByteString(..), unsafeWithForeignPtr, accursedUnutterablePerformIO)
import GHC.Int (Int(..), Int32)
import GHC.Word (Word32(..), byteSwap32)
import Foreign.Storable (peek)
import GHC.Ptr (castPtr, plusPtr, Ptr)
import GHC.Exts
import GHC.IO (IO(..), unsafePerformIO)
import Data.Array.Base
import Data.Array.IO.Internals (unsafeFreezeIOUArray)
import Control.DeepSeq (NFData(..))
import GHC.Storable
import GHC.ByteOrder

#include <ghcautoconf.h>

-- | Construct a `UArray Word32 Word32` from a ByteString of 32bit big endian
-- words.
--
-- Note: If using `unsafePerformIO`, be sure to force the result of running the
-- IO action right away... (e.g. see calls to beToLe in StringTable)
beToLe :: (Integral i, Num i) => i
-- ^ The total array length (the number of 32bit words in the array)
-> BS.ByteString
-- ^ The bytestring from which the UArray is constructed.
-- The content must start in the first byte! (i.e. the meta-data words
-- that shouldn't be part of the array, must have been dropped already)
-> IO (UArray i Word32)
beToLe lenArr (BS fptr _) = do
unsafeWithForeignPtr fptr $ \ptr -> do
let ptr' = castPtr ptr :: Ptr Word32
!(I# lenBytes#) = fromIntegral (lenArr * 4)

-- In spirit, the following does this, but we can't use `newGenArray`
-- because it only has been introduced in later versions of array:
-- @@
-- unsafeFreezeIOUArray =<<
-- newGenArray (0, lenArr - 1) (\offset -> do
-- byteSwap32 <$> peek (ptr' `plusPtr` (fromIntegral offset * 4)))
-- @@
IO $ \rw0 ->
case newByteArray# lenBytes# rw0 of
(# rw1, mba# #) ->

let loop :: Int -> State# RealWorld -> State# RealWorld
loop !offset st
| offset < fromIntegral lenArr
= let IO getV = readWord32OffPtrBE ptr' offset
!(I# o#) = offset
in case getV st of
(# st', W32# v# #) ->
loop (offset + 1) (writeWord32Array# mba# o# v# st')
| otherwise = st

in case unsafeFreezeByteArray# mba# (loop 0 rw1) of
(# rw2, ba# #) -> (# rw2, UArray 0 (lenArr - 1) (fromIntegral lenArr) ba# #)

{-# SPECIALISE beToLe :: Word32 -> BS.ByteString -> IO (UArray Word32 Word32) #-}
{-# SPECIALISE beToLe :: Int32 -> BS.ByteString -> IO (UArray Int32 Word32) #-}

readInt32BE :: BS.ByteString -> Int -> Int32
readInt32BE bs i = fromIntegral (readWord32BE bs i)
{-# INLINE readInt32BE #-}

readWord32OffPtrBE :: Ptr Word32 -> Int -> IO Word32
readWord32OffPtrBE ptr i = do
#if defined(WORDS_BIGENDIAN)
readWord32OffPtr ptr i
#else
byteSwap32 <$> readWord32OffPtr ptr i
#endif

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE (BS fptr len) i =
assert (i >= 0 && i+3 <= len - 1) $
accursedUnutterablePerformIO $
unsafeWithForeignPtr fptr $ \ptr -> do
readWord32OffPtrBE (castPtr ptr) i
{-# INLINE readWord32BE #-}
1 change: 1 addition & 0 deletions tar.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ library tar-internal
Codec.Archive.Tar.Index.StringTable
Codec.Archive.Tar.Index.IntTrie
Codec.Archive.Tar.Index.Internal
Codec.Archive.Tar.Index.Utils

other-extensions:
BangPatterns
Expand Down

0 comments on commit af3d0c7

Please sign in to comment.