Switch builder implementation to use chunks (#10)

* switch builder implementation to use chunks
* micro-optimize the conditional for builders of length one
* use fromBoundedOne for ascii chars
This commit is contained in:
Andrew Martin 2019-10-09 16:30:02 -04:00 committed by GitHub
parent 3ad5261ff4
commit 8baf4cc369
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 347 additions and 249 deletions

View file

@ -35,6 +35,7 @@ expectedSmall :: ByteArray
expectedSmall = Bytes.toByteArray $ Bytes.fromAsciiString expectedSmall = Bytes.toByteArray $ Bytes.fromAsciiString
"((AB59,(1F33,2E71)),((((FA9A,247B),890C),(0F13,((55BF,7CF1),389B))),1205))" "((AB59,(1F33,2E71)),((((FA9A,247B),890C),(0F13,((55BF,7CF1),389B))),1205))"
exampleSmall :: Word16Tree exampleSmall :: Word16Tree
exampleSmall = Branch exampleSmall = Branch
(Branch (Branch

View file

@ -36,6 +36,7 @@ flag checked
library library
exposed-modules: exposed-modules:
Data.Bytes.Chunks
Data.ByteArray.Builder Data.ByteArray.Builder
Data.ByteArray.Builder.Unsafe Data.ByteArray.Builder.Unsafe
Data.ByteArray.Builder.Bounded Data.ByteArray.Builder.Bounded
@ -90,7 +91,7 @@ benchmark bench
, small-bytearray-builder , small-bytearray-builder
, text-short , text-short
, byteslice , byteslice
ghc-options: -Wall -O2 ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl -dsuppress-all
default-language: Haskell2010 default-language: Haskell2010
hs-source-dirs: bench, common hs-source-dirs: bench, common
main-is: Main.hs main-is: Main.hs

View file

@ -1,4 +1,5 @@
{-# language BangPatterns #-} {-# language BangPatterns #-}
{-# language DataKinds #-}
{-# language DuplicateRecordFields #-} {-# language DuplicateRecordFields #-}
{-# language LambdaCase #-} {-# language LambdaCase #-}
{-# language MagicHash #-} {-# language MagicHash #-}
@ -9,19 +10,12 @@
module Data.ByteArray.Builder module Data.ByteArray.Builder
( -- * Bounded Primitives ( -- * Bounded Primitives
Builder Builder
, construct
, fromBounded , fromBounded
-- * Evaluation -- * Evaluation
, run , run
, pasteST
, pasteIO
, pasteGrowST
, pasteGrowIO
, pasteArrayST
, pasteArrayIO
-- * Materialized Byte Sequences -- * Materialized Byte Sequences
, bytes , bytes
, bytearray , byteArray
, shortTextUtf8 , shortTextUtf8
, shortTextJsonString , shortTextJsonString
, cstring , cstring
@ -77,9 +71,10 @@ module Data.ByteArray.Builder
) where ) where
import Control.Monad.Primitive (primitive_) import Control.Monad.Primitive (primitive_)
import Control.Monad.ST (ST,stToIO) import Control.Monad.ST (ST,stToIO,runST)
import Control.Monad.ST.Run (runByteArrayST) import Control.Monad.ST.Run (runByteArrayST)
import Data.ByteArray.Builder.Unsafe (Builder(Builder)) import Data.ByteArray.Builder.Unsafe (Builder(Builder))
import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
@ -90,8 +85,9 @@ import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.Text.Short (ShortText) import Data.Text.Short (ShortText)
import Data.Word (Word64,Word32,Word16,Word8) import Data.Word (Word64,Word32,Word16,Word8)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#)) import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#))
import GHC.Exts ((+#),(-#)) import GHC.Exts (MutableByteArray#,(+#),(-#),(<#))
import GHC.ST (ST(ST)) import GHC.ST (ST(ST))
import Data.Bytes.Chunks (Chunks(..))
import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic import qualified Arithmetic.Types as Arithmetic
@ -102,105 +98,31 @@ import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
-- | Run a builder. An accurate size hint is important for -- | Run a builder.
-- good performance. The size hint should be slightly greater
-- than or equal to the actual size.
run :: run ::
Int -- ^ Hint for upper bound on size Int -- ^ Size of initial chunk (use 4080 if uncertain)
-> Builder -- ^ Builder -> Builder -- ^ Builder
-> ByteArray -> Chunks
run hint b = runByteArrayST $ do run hint@(I# hint# ) (Builder f) = runST $ do
let go !n = do MutableByteArray buf0 <- PM.newByteArray hint
arr <- PM.newByteArray n cs <- ST $ \s0 -> case f buf0 0# hint# Initial s0 of
pasteST b (MutableBytes arr 0 n) >>= \case (# s1, bufX, offX, _, csX #) ->
Nothing -> go (n + n + 16) (# s1, Mutable bufX offX csX #)
Just len -> do commitsOntoChunks ChunksNil cs
shrinkMutableByteArray arr len
PM.unsafeFreezeByteArray arr
go (max hint 16)
-- | Variant of 'pasteArrayST' that runs in 'IO'. -- Internal. This freezes all the mutable byte arrays in-place,
pasteArrayIO :: -- so be careful. It also reverses the chunks since everything
MutableBytes RealWorld -- ^ Buffer -- is backwards.
-> (a -> Builder) -- ^ Builder commitsOntoChunks :: Chunks -> Commits s -> ST s Chunks
-> V.Vector a -- ^ Elements to serialize commitsOntoChunks !xs Initial = pure xs
-> IO (V.Vector a, MutableBytes RealWorld) -- ^ Shifted vector, shifted buffer commitsOntoChunks !xs (Immutable arr off len cs) =
pasteArrayIO !arr f !xs = stToIO (pasteArrayST arr f xs) commitsOntoChunks (ChunksCons (Bytes (ByteArray arr) (I# off) (I# len)) xs) cs
commitsOntoChunks !xs (Mutable buf len cs) = case len of
-- | Fold over a vector, applying the builder to each element until 0# -> commitsOntoChunks xs cs
-- the buffer cannot accomodate any more. _ -> do
pasteArrayST :: shrinkMutableByteArray (MutableByteArray buf) (I# len)
MutableBytes s -- ^ Buffer arr <- PM.unsafeFreezeByteArray (MutableByteArray buf)
-> (a -> Builder) -- ^ Builder commitsOntoChunks (ChunksCons (Bytes arr 0 (I# len)) xs) cs
-> V.Vector a -- ^ Elements to serialize
-> ST s (V.Vector a, MutableBytes s) -- ^ Shifted vector, shifted buffer
pasteArrayST (MutableBytes arr off0 len0) f !xs0 = do
let go !xs !ixBufA !lenBufA = if V.length xs > 0
then do
let a = V.unsafeHead xs
pasteST (f a) (MutableBytes arr ixBufA lenBufA) >>= \case
Nothing -> pure (xs,MutableBytes arr ixBufA lenBufA)
Just ixBufB ->
go (V.unsafeTail xs) ixBufB (lenBufA + (ixBufA - ixBufB))
else pure (xs,MutableBytes arr ixBufA lenBufA)
go xs0 off0 len0
-- | Paste the builder into the byte array starting at offset zero.
-- This repeatedly reallocates the byte array if it cannot accomodate
-- the builder, replaying the builder each time.
pasteGrowST ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArrayOffset s
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> ST s (MutableByteArrayOffset s)
-- ^ Final buffer that accomodated the builder.
pasteGrowST !n b !(MutableByteArrayOffset arr0 off0) = do
let go !arr !sz = pasteST b (MutableBytes arr off0 (sz - off0)) >>= \case
Nothing -> do
let szNext = sz + n
arrNext <- PM.resizeMutableByteArray arr szNext
go arrNext szNext
Just ix -> pure (MutableByteArrayOffset{array=arr,offset=ix})
go arr0 =<< PM.getSizeofMutableByteArray arr0
-- | Variant of 'pasteGrowST' that runs in 'IO'.
pasteGrowIO ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArrayOffset RealWorld
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> IO (MutableByteArrayOffset RealWorld)
-- ^ Final buffer that accomodated the builder.
pasteGrowIO !n b !arr = stToIO (pasteGrowST n b arr)
-- | Execute the builder, pasting its contents into a buffer.
-- If the buffer is not large enough, this returns 'Nothing'.
-- Otherwise, it returns the index in the buffer that follows
-- the payload just written.
pasteST :: Builder -> MutableBytes s -> ST s (Maybe Int)
{-# inline pasteST #-}
pasteST (Builder f) (MutableBytes (MutableByteArray arr) (I# off) (I# len)) =
ST $ \s0 -> case f arr off len s0 of
(# s1, r #) -> if Exts.isTrue# (r /=# (-1#))
then (# s1, Just (I# r) #)
else (# s1, Nothing #)
-- | Variant of 'pasteST' that runs in 'IO'.
pasteIO :: Builder -> MutableBytes RealWorld -> IO (Maybe Int)
{-# inline pasteIO #-}
pasteIO b m = stToIO (pasteST b m)
-- | Constructor for 'Builder' that works on a function with lifted
-- arguments instead of unlifted ones. This is just as unsafe as the
-- actual constructor.
construct :: (forall s. MutableBytes s -> ST s (Maybe Int)) -> Builder
construct f = Builder
$ \arr off len s0 ->
case unST (f (MutableBytes (MutableByteArray arr) (I# off) (I# len))) s0 of
(# s1, m #) -> case m of
Nothing -> (# s1, (-1#) #)
Just (I# n) -> (# s1, n #)
-- | Convert a bounded builder to an unbounded one. If the size -- | Convert a bounded builder to an unbounded one. If the size
-- is a constant, use @Arithmetic.Nat.constant@ as the first argument -- is a constant, use @Arithmetic.Nat.constant@ as the first argument
@ -210,23 +132,51 @@ fromBounded ::
-> Bounded.Builder n -> Bounded.Builder n
-> Builder -> Builder
{-# inline fromBounded #-} {-# inline fromBounded #-}
fromBounded n (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> fromBounded n (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(I# req) = Nat.demote n in let !(I# req) = Nat.demote n
case len >=# req of !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
1# -> f arr off s0 1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> (# s0, (-1#) #) _ -> let !(I# lenX) = max 4080 (I# req) in
case Exts.newByteArray# lenX s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
in case f buf1 off1 s1 of
(# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- This is a micro-optimization that uses an equality check instead
-- of an inequality check when the required number of bytes is one.
-- Use this instead of fromBounded (where possible) leads to marginally
-- better results in benchmarks.
fromBoundedOne ::
Bounded.Builder 1
-> Builder
{-# inline fromBoundedOne #-}
fromBoundedOne (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 of
0# -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
_ -> (# s0, buf0, off0, len0, cs0 #)
in case f buf1 off1 s1 of
(# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- | Create a builder from an unsliced byte sequence. -- | Create a builder from an unsliced byte sequence.
bytearray :: ByteArray -> Builder byteArray :: ByteArray -> Builder
bytearray a = bytes (Bytes a 0 (PM.sizeofByteArray a)) byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
-- | Create a builder from a sliced byte sequence. -- | Create a builder from a sliced byte sequence.
bytes :: Bytes -> Builder bytes :: Bytes -> Builder
bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len >= slen bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder
then do (\buf0 off0 len0 cs0 s0 -> if slen >= 1024
PM.copyByteArray arr off src soff slen then case Exts.newByteArray# 0# s0 of
pure (Just (off + slen)) (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
else pure Nothing else case len0 <# slen# of
1# -> case Exts.newByteArray# 4080# s0 of
(# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #)
_ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
(# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
)
-- | Create a builder from a slice of an array of 'Word8'. There is the same -- | Create a builder from a slice of an array of 'Word8'. There is the same
-- as 'bytes' but is provided as a convenience for users working with different -- as 'bytes' but is provided as a convenience for users working with different
@ -238,10 +188,7 @@ word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len)
-- byte sequence is UTF-8 encoded text. -- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
{-# inline slicedUtf8TextJson #-} {-# inline slicedUtf8TextJson #-}
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) -> slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do
let slen0 = I# slen0#
in if dlen0 > (2 * slen0) + 2
then do
PM.writeByteArray dst doff0 (c2w '"') PM.writeByteArray dst doff0 (c2w '"')
let go !soff !slen !doff = if slen > 0 let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of then case indexChar8Array (ByteArray src#) soff of
@ -261,8 +208,24 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
else pure doff else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1) doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"') PM.writeByteArray dst doffRes (c2w '"')
pure (Just (doffRes + 1)) pure (doffRes + 1)
else pure Nothing where
slen0 = I# slen0#
reqLen = (2 * slen0) + 2
-- | Constructor for 'Builder' that works on a function with lifted
-- arguments instead of unlifted ones. This is just as unsafe as the
-- actual constructor.
fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> let !(I# lenX) = max 4080 (I# req) in
case Exts.newByteArray# lenX s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
(# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- Internal. Write two characters in the ASCII plane to a byte array. -- Internal. Write two characters in the ASCII plane to a byte array.
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s () write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
@ -420,7 +383,7 @@ word8PaddedUpperHex w =
-- | Encode an ASCII char. -- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked. -- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder ascii :: Char -> Builder
ascii c = fromBounded Nat.constant (Bounded.ascii c) ascii c = fromBoundedOne (Bounded.ascii c)
-- | Encode an UTF8 char. This only uses as much space as is required. -- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder char :: Char -> Builder
@ -475,7 +438,7 @@ word16BE w = fromBounded Nat.constant (Bounded.word16BE w)
-- | Requires exactly 1 byte. -- | Requires exactly 1 byte.
word8 :: Word8 -> Builder word8 :: Word8 -> Builder
word8 w = fromBounded Nat.constant (Bounded.word8 w) word8 w = fromBoundedOne (Bounded.word8 w)
-- | Prefix a builder with its size in bytes. This size is -- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 32-bit word. The need to prefix -- presented as a big-endian 32-bit word. The need to prefix
@ -490,35 +453,54 @@ word8 w = fromBounded Nat.constant (Bounded.word8 w)
-- However, using 'consLength32BE' is much more efficient here -- However, using 'consLength32BE' is much more efficient here
-- since it only materializes the 'ByteArray' once. -- since it only materializes the 'ByteArray' once.
consLength32BE :: Builder -> Builder consLength32BE :: Builder -> Builder
consLength32BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 4# of consLength32BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
1# -> case f arr (off +# 4# ) (len -# 4# ) s0 of let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of
(# s1, r #) -> case r of 1# -> (# s0, buf0, off0, len0, cs0 #)
(-1#) -> (# s1, (-1#) #) _ -> case Exts.newByteArray# 4080# s0 of
_ -> (# sX, bufX #) ->
let ST g = UnsafeBounded.pasteST (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
(Bounded.word32BE (fromIntegral ((I# r - I# off) - 4))) in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of
(MutableByteArray arr) (# s2, buf2, off2, len2, cs2 #) ->
(I# off) let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
in case g s1 of 1# -> off2 -# off1
(# s2, _ #) -> (# s2, r #) _ -> commitDistance buf1 off2 cs2 -# off1
_ -> (# s0, (-1#) #) ST g = UnsafeBounded.pasteST
(Bounded.word32BE (fromIntegral (I# (dist -# 4# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
-- | Prefix a builder with its size in bytes. This size is -- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 64-bit word. See 'consLength32BE'. -- presented as a big-endian 64-bit word. See 'consLength32BE'.
consLength64BE :: Builder -> Builder consLength64BE :: Builder -> Builder
consLength64BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 8# of consLength64BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
1# -> case f arr (off +# 8# ) (len -# 8# ) s0 of let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 8# of
(# s1, r #) -> case r of 1# -> (# s0, buf0, off0, len0, cs0 #)
(-1#) -> (# s1, (-1#) #) _ -> case Exts.newByteArray# 4080# s0 of
_ -> (# sX, bufX #) ->
let ST g = UnsafeBounded.pasteST (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
(Bounded.word64BE (fromIntegral ((I# r - I# off) - 8))) in case f buf1 (off1 +# 8# ) (len1 -# 8# ) cs1 s1 of
(MutableByteArray arr) (# s2, buf2, off2, len2, cs2 #) ->
(I# off) let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
in case g s1 of 1# -> off2 -# off1
(# s2, _ #) -> (# s2, r #) _ -> commitDistance buf1 off2 cs2 -# off1
_ -> (# s0, (-1#) #) ST g = UnsafeBounded.pasteST
(Bounded.word64BE (fromIntegral (I# (dist -# 8# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int#
commitDistance _ !_ Initial = error "chunkDistance: chunk not found"
commitDistance target !n (Immutable _ _ len cs) =
commitDistance target (n +# len) cs
commitDistance target !n (Mutable buf len cs) =
case Exts.sameMutableByteArray# target buf of
1# -> n +# len
_ -> commitDistance target (n +# len) cs
-- ShortText is already UTF-8 encoded. This is a no-op. -- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray shortTextToByteArray :: ShortText -> ByteArray

View file

@ -9,6 +9,7 @@
module Data.ByteArray.Builder.Unsafe module Data.ByteArray.Builder.Unsafe
( -- * Types ( -- * Types
Builder(..) Builder(..)
, Commits(..)
-- * Safe Functions -- * Safe Functions
-- | These functions are actually completely safe, but they are defined -- | These functions are actually completely safe, but they are defined
-- here because they are used by typeclass instances. Import them from -- here because they are used by typeclass instances. Import them from
@ -17,10 +18,10 @@ module Data.ByteArray.Builder.Unsafe
, cstring , cstring
) where ) where
import Data.Primitive (MutableByteArray(MutableByteArray)) import Data.Primitive (MutableByteArray(MutableByteArray),ByteArray)
import Foreign.C.String (CString) import Foreign.C.String (CString)
import GHC.Exts ((-#),(+#),(/=#),(>#)) import GHC.Exts ((-#),(+#),(>#))
import GHC.Exts (Addr#,Int(I#),Ptr(Ptr)) import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
import GHC.Exts (IsString,Int#,State#,MutableByteArray#) import GHC.Exts (IsString,Int#,State#,MutableByteArray#)
import GHC.ST (ST(ST)) import GHC.ST (ST(ST))
import GHC.Base (unpackCString#,unpackCStringUtf8#) import GHC.Base (unpackCString#,unpackCStringUtf8#)
@ -32,10 +33,14 @@ import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
-- | An unmaterialized sequence of bytes that may be pasted -- | An unmaterialized sequence of bytes that may be pasted
-- into a mutable byte array. -- into a mutable byte array.
newtype Builder newtype Builder
= Builder (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) = Builder (forall s.
-- ^ This function takes a buffer, an offset, and a number of remaining bytes. MutableByteArray# s -> -- buffer we are currently writing to
-- It returns the new offset (should be greater than the old offset), or if Int# -> -- offset into the current buffer
-- there was not enough space left in buffer, it returns -1. Int# -> -- number of bytes remaining in the current buffer
Commits s -> -- buffers and immutable byte slices that we have already committed
State# s ->
(# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things
)
instance IsString Builder where instance IsString Builder where
{-# inline fromString #-} {-# inline fromString #-}
@ -43,53 +48,73 @@ instance IsString Builder where
instance Semigroup Builder where instance Semigroup Builder where
{-# inline (<>) #-} {-# inline (<>) #-}
Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of Builder f <> Builder g = Builder $ \buf0 off0 len0 cs0 s0 -> case f buf0 off0 len0 cs0 s0 of
(# s1, r #) -> case r /=# (-1#) of (# s1, buf1, off1, len1, cs1 #) -> g buf1 off1 len1 cs1 s1
1# -> g arr r (len0 +# (off0 -# r)) s1
_ -> (# s1, (-1#) #)
instance Monoid Builder where instance Monoid Builder where
{-# inline mempty #-} {-# inline mempty #-}
mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) mempty = Builder $ \buf0 off0 len0 cs0 s0 -> (# s0, buf0, off0, len0, cs0 #)
data Commits s
= Mutable
(MutableByteArray# s)
-- ^ Mutable buffer, start index implicitly zero
Int# -- ^ Length (may be smaller than actual length)
!(Commits s)
| Immutable
ByteArray# -- ^ Immutable chunk
Int# -- ^ Offset into chunk, not necessarily zero
Int# -- ^ Length (may be smaller than actual length)
!(Commits s)
| Initial
-- | Create a builder from a cons-list of 'Char'. These -- | Create a builder from a cons-list of 'Char'. These
-- are be UTF-8 encoded. -- are be UTF-8 encoded.
stringUtf8 :: String -> Builder stringUtf8 :: String -> Builder
{-# inline stringUtf8 #-} {-# inline stringUtf8 #-}
stringUtf8 cs = Builder (\arr off0 len0 s0 -> goString cs arr off0 len0 s0) stringUtf8 cs = Builder (goString cs)
-- | Create a builder from a @NUL@-terminated 'CString'. This ignores any -- | Create a builder from a @NUL@-terminated 'CString'. This ignores any
-- textual encoding, copying bytes until @NUL@ is reached. -- textual encoding, copying bytes until @NUL@ is reached.
cstring :: CString -> Builder cstring :: CString -> Builder
{-# inline cstring #-} {-# inline cstring #-}
cstring (Ptr cs) = Builder (\arr off0 len0 s0 -> goCString cs arr off0 len0 s0) cstring (Ptr cs) = Builder (goCString cs)
goString :: String -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) goString :: String
-> MutableByteArray# s -> Int# -> Int# -> Commits s
-> State# s -> (# State# s, MutableByteArray# s, Int#, Int#, Commits s #)
{-# noinline goString #-} {-# noinline goString #-}
goString [] _ off0 _ s0 = (# s0, off0 #) goString [] buf0 off0 len0 cs0 s0 = (# s0, buf0, off0, len0, cs0 #)
goString (c : cs) buf off0 len0 s0 = case len0 ># 3# of goString (c : cs) buf0 off0 len0 cs0 s0 = case len0 ># 3# of
1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf) (I# off0)) s0 of 1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf0) (I# off0)) s0 of
(# s1, I# off1 #) -> goString cs buf off1 (len0 -# (off1 -# off0)) s1 (# s1, I# off1 #) -> goString cs buf0 off1 (len0 -# (off1 -# off0)) cs0 s1
_ -> (# s0, (-1#) #) _ -> case Exts.newByteArray# 4080# s0 of
(# s1, buf1 #) -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf1) 0) s1 of
(# s2, I# off1 #) -> goString cs buf1 off1 (4080# -# off1) (Mutable buf0 off0 cs0) s2
-- We have to have a rule for both unpackCString# and unpackCStringUtf8# -- We have to have a rule for both unpackCString# and unpackCStringUtf8#
-- since GHC uses a different function based on whether or not non-ASCII -- since GHC uses a different function based on whether or not non-ASCII
-- codepoints are used in the string. -- codepoints are used in the string.
{-# RULES {-# RULES
"Builder stringUtf8/cstring" forall s a b c d. "Builder stringUtf8/cstring" forall s a b c d e.
goString (unpackCString# s) a b c d = goCString s a b c d goString (unpackCString# s) a b c d e = goCString s a b c d e
"Builder stringUtf8/cstring-utf8" forall s a b c d. "Builder stringUtf8/cstring-utf8" forall s a b c d e.
goString (unpackCStringUtf8# s) a b c d = goCString s a b c d goString (unpackCStringUtf8# s) a b c d e = goCString s a b c d e
#-} #-}
goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> Commits s
goCString addr buf off0 len0 s0 = case Exts.indexWord8OffAddr# addr 0# of -> State# s -> (# State# s, MutableByteArray# s, Int#, Int#, Commits s #)
0## -> (# s0, off0 #) goCString addr buf0 off0 len0 cs0 s0 = case Exts.indexWord8OffAddr# addr 0# of
0## -> (# s0, buf0, off0, len0, cs0 #)
w -> case len0 of w -> case len0 of
0# -> (# s0, (-1#) #) 0# -> case Exts.newByteArray# 4080# s0 of
_ -> case Exts.writeWord8Array# buf off0 w s0 of (# s1, buf1 #) -> case Exts.writeWord8Array# buf1 0# w s1 of
s1 -> goCString (Exts.plusAddr# addr 1# ) buf (off0 +# 1# ) (len0 -# 1# ) s1 s2 -> goCString
(Exts.plusAddr# addr 1# ) buf1 1# (4080# -# 1# )
(Mutable buf0 off0 cs0)
s2
_ -> case Exts.writeWord8Array# buf0 off0 w s0 of
s1 -> goCString (Exts.plusAddr# addr 1# ) buf0 (off0 +# 1# ) (len0 -# 1# ) cs0 s1
unST :: ST s a -> State# s -> (# State# s, a #) unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f unST (ST f) = f

76
src/Data/Bytes/Chunks.hs Normal file
View file

@ -0,0 +1,76 @@
{-# language BangPatterns #-}
{-# language DerivingStrategies #-}
{-# language TypeFamilies #-}
{-# language MagicHash #-}
{-# language UnboxedTuples #-}
{-# language NamedFieldPuns #-}
module Data.Bytes.Chunks
( Chunks(..)
, concat
) where
import Prelude hiding (length,concat)
import GHC.ST (ST(..))
import Data.Bytes.Types (Bytes(..))
import Data.Primitive (ByteArray(..),MutableByteArray(..))
import GHC.Exts (ByteArray#,MutableByteArray#)
import GHC.Exts (IsList,Int#,State#,Int(I#),(+#),(-#))
import Control.Monad.ST.Run (runByteArrayST)
import qualified GHC.Exts as Exts
import qualified Data.Primitive as PM
data Chunks
= ChunksCons {-# UNPACK #-} !Bytes !Chunks
| ChunksNil
concat :: Chunks -> ByteArray
concat x = ByteArray (concat# x)
concat# :: Chunks -> ByteArray#
{-# noinline concat# #-}
concat# ChunksNil = case mempty of {ByteArray x -> x}
concat# (ChunksCons (Bytes{array=c,offset=coff,length=szc}) cs) = case cs of
ChunksNil -> case c of {ByteArray x -> x}
ChunksCons (Bytes{array=d,offset=doff,length=szd}) ds ->
unBa $ runByteArrayST $ do
let szboth = szc + szd
len = chunksLengthGo szboth ds
dst <- PM.newByteArray len
PM.copyByteArray dst 0 c coff szc
PM.copyByteArray dst szc d doff szd
_ <- copy dst szboth ds
PM.unsafeFreezeByteArray dst
chunksLengthGo :: Int -> Chunks -> Int
chunksLengthGo !n ChunksNil = n
chunksLengthGo !n (ChunksCons (Bytes{length}) cs) =
chunksLengthGo (n + length) cs
-- | Copy the contents of the chunks into a mutable array.
-- Precondition: The destination must have enough space to
-- house the contents. This is not checked.
copy ::
MutableByteArray s -- ^ Destination
-> Int -- ^ Destination offset
-> Chunks -- ^ Source
-> ST s Int -- ^ Returns the next index into the destination after the payload
{-# inline copy #-}
copy (MutableByteArray dst) (I# off) cs = ST
(\s0 -> case copy# dst off cs s0 of
(# s1, nextOff #) -> (# s1, I# nextOff #)
)
copy# :: MutableByteArray# s -> Int# -> Chunks -> State# s -> (# State# s, Int# #)
copy# _ off ChunksNil s0 = (# s0, off #)
copy# marr off (ChunksCons (Bytes{array,offset,length}) cs) s0 =
case Exts.copyByteArray# (unBa array) (unI offset) marr off (unI length) s0 of
s1 -> copy# marr (off +# unI length) cs s1
unI :: Int -> Int#
unI (I# i) = i
unBa :: ByteArray -> ByteArray#
unBa (ByteArray x) = x

View file

@ -18,6 +18,7 @@ import qualified Arithmetic.Nat as Nat
import qualified Data.ByteString as ByteString import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Lazy.Char8 as LB import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.Bytes.Chunks as Chunks
import qualified Data.List as L import qualified Data.List as L
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Text as T import qualified Data.Text as T
@ -37,79 +38,99 @@ tests :: TestTree
tests = testGroup "Tests" tests = testGroup "Tests"
[ testGroup "live" [ testGroup "live"
[ TQC.testProperty "word64Dec" $ \w -> [ TQC.testProperty "word64Dec" $ \w ->
run 1 (word64Dec w) === pack (show w) runConcat 1 (word64Dec w) === pack (show w)
, TQC.testProperty "word64Dec-x3" $ \x y z -> , TQC.testProperty "word64Dec-x3" $ \x y z ->
run 1 (word64Dec x <> word64Dec y <> word64Dec z) runConcat 1 (word64Dec x <> word64Dec y <> word64Dec z)
=== ===
pack (show x ++ show y ++ show z) pack (show x ++ show y ++ show z)
, TQC.testProperty "int64Dec-x3" $ \x y z -> , TQC.testProperty "int64Dec-x3" $ \x y z ->
run 1 (int64Dec x <> int64Dec y <> int64Dec z) runConcat 1 (int64Dec x <> int64Dec y <> int64Dec z)
=== ===
pack (show x ++ show y ++ show z) pack (show x ++ show y ++ show z)
, TQC.testProperty "word64BE-x3" $ \x y z -> , TQC.testProperty "word64BE-x3" $ \x y z ->
run 1 (word64BE x <> word64BE y <> word64BE z) runConcat 1 (word64BE x <> word64BE y <> word64BE z)
=== ===
pack (LB.unpack (BB.toLazyByteString (BB.word64BE x <> BB.word64BE y <> BB.word64BE z))) pack (LB.unpack (BB.toLazyByteString (BB.word64BE x <> BB.word64BE y <> BB.word64BE z)))
, TQC.testProperty "word64PaddedUpperHex" $ \w -> , TQC.testProperty "word64PaddedUpperHex" $ \w ->
run 1 (word64PaddedUpperHex w) runConcat 1 (word64PaddedUpperHex w)
=== ===
pack (showWord64PaddedUpperHex w) pack (showWord64PaddedUpperHex w)
, TQC.testProperty "word8Dec" $ \w -> , TQC.testProperty "word8Dec" $ \w ->
run 1 (word8Dec w) runConcat 1 (word8Dec w)
=== ===
pack (show w) pack (show w)
, TQC.testProperty "consLength32BE" $ \w -> , TQC.testProperty "consLength32BE" $ \w ->
run 1 (consLength32BE (word8Dec w)) runConcat 1 (consLength32BE (word8Dec w))
=== ===
pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w) pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w)
, TQC.testProperty "consLength64BE" $ \w -> , TQC.testProperty "consLength64BE" $ \w ->
run 1 (consLength64BE (word16Dec w)) pack
( '\x00' : '\x00' : '\x00' : '\x00'
: '\x00' : '\x00' : '\x00' : chr (L.length (show w))
: show w
)
=== ===
pack ('\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w) runConcat 1 (consLength64BE (word16Dec w))
, TQC.testProperty "pasteArrayST" $ \(xs :: [Word64]) ->
(runArray word64Dec (V.fromList xs))
===
pack (foldMap show xs)
, THU.testCase "stringUtf8" $ , THU.testCase "stringUtf8" $
packUtf8 "¿Cómo estás? I am doing well." @=? packUtf8 "¿Cómo estás? I am doing well." @=?
run 1 (stringUtf8 "¿Cómo estás? I am doing well.") runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.")
, THU.testCase "doubleDec-A" $ , THU.testCase "doubleDec-A" $
pack (show (2 :: Int)) @=? run 1 (doubleDec 2.0) pack (show (2 :: Int)) @=? runConcat 1 (doubleDec 2.0)
, THU.testCase "doubleDec-B" $ , THU.testCase "doubleDec-B" $
pack (show (2.5 :: Double)) @=? run 1 (doubleDec 2.5) pack (show (2.5 :: Double)) @=? runConcat 1 (doubleDec 2.5)
, THU.testCase "doubleDec-C" $ , THU.testCase "doubleDec-C" $
pack ("1e+15") @=? run 1 (doubleDec 1e15) pack ("1e+15") @=? runConcat 1 (doubleDec 1e15)
, THU.testCase "doubleDec-D" $ , THU.testCase "doubleDec-D" $
pack ("-42") @=? run 1 (doubleDec (-42)) pack ("-42") @=? runConcat 1 (doubleDec (-42))
, THU.testCase "doubleDec-E" $ , THU.testCase "doubleDec-E" $
pack ("-8.88888888888888e+14") @=? run 1 (doubleDec (-888888888888888.8888888)) pack ("-8.88888888888888e+14") @=? runConcat 1 (doubleDec (-888888888888888.8888888))
, THU.testCase "doubleDec-F" $ , THU.testCase "doubleDec-F" $
pack ("42") @=? run 1 (doubleDec 42) pack ("42") @=? runConcat 1 (doubleDec 42)
, THU.testCase "doubleDec-G" $ , THU.testCase "doubleDec-G" $
pack ("0") @=? run 1 (doubleDec 0) pack ("0") @=? runConcat 1 (doubleDec 0)
, THU.testCase "doubleDec-H" $ , THU.testCase "doubleDec-H" $
pack ("0.5") @=? run 1 (doubleDec 0.5) pack ("0.5") @=? runConcat 1 (doubleDec 0.5)
, THU.testCase "doubleDec-I" $ , THU.testCase "doubleDec-I" $
pack ("-0.5") @=? run 1 (doubleDec (-0.5)) pack ("-0.5") @=? runConcat 1 (doubleDec (-0.5))
, THU.testCase "doubleDec-J" $ , THU.testCase "doubleDec-J" $
pack ("999999999") @=? run 1 (doubleDec 999999999) pack ("999999999") @=? runConcat 1 (doubleDec 999999999)
, THU.testCase "doubleDec-K" $ , THU.testCase "doubleDec-K" $
pack ("-99999999") @=? run 1 (doubleDec (-99999999)) pack ("-99999999") @=? runConcat 1 (doubleDec (-99999999))
, THU.testCase "shortTextJsonString-A" $ , THU.testCase "shortTextJsonString-A" $
pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello") pack ("\"hello\"") @=? runConcat 1 (shortTextJsonString "hello")
, THU.testCase "shortTextJsonString-B" $ , THU.testCase "shortTextJsonString-B" $
pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/") pack ("\"\\\\_\\\"_/\"") @=? runConcat 1 (shortTextJsonString "\\_\"_/")
, THU.testCase "shortTextJsonString-C" $ , THU.testCase "shortTextJsonString-C" $
pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo") pack ("\"Hi\\r\\nLo\"") @=? runConcat 1 (shortTextJsonString "Hi\r\nLo")
, THU.testCase "shortTextJsonString-D" $ , THU.testCase "shortTextJsonString-D" $
pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo") pack ("\"Hi\\u001BLo\"") @=? runConcat 1 (shortTextJsonString "Hi\ESCLo")
, THU.testCase "word-16-tree" $ , THU.testCase "word-16-tree" $
Word16Tree.expectedSmall @=? run 1 Word16Tree.expectedSmall @=? runConcat 1
(Word16Tree.encode Word16Tree.exampleSmall) (Word16Tree.encode Word16Tree.exampleSmall)
, THU.testCase "byteArray-small" $
let a = replicateByte 3 0x50
b = replicateByte 5 0x51
in mconcat [a,b] @=? runConcat 1
( byteArray a <> byteArray b )
, THU.testCase "byteArray-big" $
let a = replicateByte 2105 0x50
b = replicateByte 725 0x51
c = replicateByte 900 0x52
d = replicateByte 800 0x53
e = replicateByte 700 0x54
f = replicateByte 950 0x55
g = replicateByte 975 0x56
h = replicateByte 3000 0x57
i = replicateByte 125 0x58
in mconcat [a,b,c,d,e,f,g,h,i] @=? runConcat 1
( byteArray a <> byteArray b <> byteArray c <>
byteArray d <> byteArray e <> byteArray f <>
byteArray g <> byteArray h <> byteArray i
)
] ]
, testGroup "alternate" , testGroup "alternate"
[ TQC.testProperty "HexWord64" $ \x y -> [ TQC.testProperty "HexWord64" $ \x y ->
run 1 runConcat 1
( fromBounded Nat.constant (HexWord64.word64PaddedUpperHex x) ( fromBounded Nat.constant (HexWord64.word64PaddedUpperHex x)
<> fromBounded Nat.constant (HexWord64.word64PaddedUpperHex y) <> fromBounded Nat.constant (HexWord64.word64PaddedUpperHex y)
) )
@ -118,28 +139,20 @@ tests = testGroup "Tests"
] ]
] ]
replicateByte :: Int -> Word8 -> ByteArray
replicateByte n w = runST $ do
m <- PM.newByteArray n
PM.setByteArray m 0 n w
PM.unsafeFreezeByteArray m
pack :: String -> ByteArray pack :: String -> ByteArray
pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord)
packUtf8 :: String -> ByteArray packUtf8 :: String -> ByteArray
packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack
-- This is used to test pasteArrayST
runArray ::
(a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> ByteArray -- ^ Number of elements serialized, serialization
runArray f !xs = runST $ do
let go !v0 !sz !chunks = if V.null v0
then pure (mconcat (L.reverse chunks))
else do
arr <- PM.newByteArray sz
(v1,MutableBytes _ off _) <- pasteArrayST (MutableBytes arr 0 sz) f v0
-- If nothing was serialized, we need a bigger buffer
let szNext = if V.length v0 == V.length v1 then sz + 1 else sz
c <- PM.unsafeFreezeByteArray =<< PM.resizeMutableByteArray arr off
go v1 szNext (c : chunks)
go xs 1 []
showWord64PaddedUpperHex :: Word64 -> String showWord64PaddedUpperHex :: Word64 -> String
showWord64PaddedUpperHex = printf "%016X" showWord64PaddedUpperHex = printf "%016X"
runConcat :: Int -> Builder -> ByteArray
runConcat n = Chunks.concat . run n