Add putMany and considate consLength functions

This commit is contained in:
Andrew Martin 2019-11-22 16:24:48 -05:00
parent ae5d17ce5c
commit c7fcaff97f
5 changed files with 173 additions and 47 deletions

View file

@ -1,5 +1,11 @@
# Revision history for small-bytearray-builder # Revision history for small-bytearray-builder
## 0.3.2.0 -- 2019-??-??
* Add `putMany`, which allows pasting into the same mutable byte
array over and over.
* Add `consLength`.
## 0.3.1.0 -- 2019-11-20 ## 0.3.1.0 -- 2019-11-20
* Add big-endian and little-endian parsers for `Word128`. This includes * Add big-endian and little-endian parsers for `Word128`. This includes

View file

@ -1,6 +1,6 @@
cabal-version: 2.2 cabal-version: 2.2
name: small-bytearray-builder name: small-bytearray-builder
version: 0.3.1.0 version: 0.3.2.0
synopsis: Serialize to a small byte arrays synopsis: Serialize to a small byte arrays
description: description:
This is similar to the builder facilities provided by This is similar to the builder facilities provided by
@ -47,6 +47,7 @@ library
, bytestring >=0.10.8.2 && <0.11 , bytestring >=0.10.8.2 && <0.11
, natural-arithmetic >=0.1 && <0.2 , natural-arithmetic >=0.1 && <0.2
, primitive-offset >=0.2 && <0.3 , primitive-offset >=0.2 && <0.3
, primitive-unlifted >=0.1.2 && <0.2
, run-st >=0.1 && <0.2 , run-st >=0.1 && <0.2
, text-short >=0.1.3 && <0.2 , text-short >=0.1.3 && <0.2
, wide-word >=0.1.0.9 && <0.2 , wide-word >=0.1.0.9 && <0.2
@ -74,6 +75,7 @@ test-suite test
, bytestring , bytestring
, natural-arithmetic , natural-arithmetic
, primitive , primitive
, primitive-unlifted >=0.1.2
, quickcheck-classes >=0.6.4 , quickcheck-classes >=0.6.4
, small-bytearray-builder , small-bytearray-builder
, tasty >=1.2.3 && <1.3 , tasty >=1.2.3 && <1.3

View file

@ -13,6 +13,7 @@ module Data.ByteArray.Builder
, fromBounded , fromBounded
-- * Evaluation -- * Evaluation
, run , run
, putMany
-- * Materialized Byte Sequences -- * Materialized Byte Sequences
, bytes , bytes
, copy , copy
@ -88,6 +89,7 @@ module Data.ByteArray.Builder
, int32ArrayLE , int32ArrayLE
, int16ArrayLE , int16ArrayLE
-- ** Prefixing with Length -- ** Prefixing with Length
, consLength
, consLength32LE , consLength32LE
, consLength32BE , consLength32BE
, consLength64BE , consLength64BE
@ -100,6 +102,7 @@ module Data.ByteArray.Builder
import Control.Monad.ST (ST,runST) import Control.Monad.ST (ST,runST)
import Data.ByteArray.Builder.Unsafe (Builder(Builder)) import Data.ByteArray.Builder.Unsafe (Builder(Builder))
import Data.ByteArray.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable)) import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.ByteArray.Builder.Unsafe (reverseCommitsOntoChunks) import Data.ByteArray.Builder.Unsafe (reverseCommitsOntoChunks)
import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
@ -107,14 +110,17 @@ import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Chunks (Chunks(ChunksNil)) import Data.Bytes.Chunks (Chunks(ChunksNil))
import Data.Bytes.Types (Bytes(Bytes)) import Data.Bytes.Types (Bytes(Bytes))
import Data.Char (ord) import Data.Char (ord)
import Data.Foldable (foldlM)
import Data.Int (Int64,Int32,Int16,Int8) import Data.Int (Int64,Int32,Int16,Int8)
import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..)) import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
import Data.Primitive.Unlifted.Array (MutableUnliftedArray,UnliftedArray)
import Data.Text.Short (ShortText) import Data.Text.Short (ShortText)
import Data.WideWord (Word128) import Data.WideWord (Word128)
import Data.Word (Word64,Word32,Word16,Word8) import Data.Word (Word64,Word32,Word16,Word8)
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder) import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#)) import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
import GHC.Exts (MutableByteArray#,(+#),(-#),(<#)) import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#))
import GHC.IO (IO(IO))
import GHC.ST (ST(ST)) import GHC.ST (ST(ST))
import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Nat as Nat
@ -122,6 +128,7 @@ import qualified Arithmetic.Types as Arithmetic
import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Primitive.Unlifted.Array as PM
import qualified Data.Text.Short as TS import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts import qualified GHC.Exts as Exts
@ -137,6 +144,73 @@ run hint@(I# hint# ) (Builder f) = runST $ do
(# s1, Mutable bufX offX csX #) (# s1, Mutable bufX offX csX #)
reverseCommitsOntoChunks ChunksNil cs reverseCommitsOntoChunks ChunksNil cs
-- | Run a builder against lots of elements. This fills the same
-- underlying buffer over and over again. Do not let the argument to
-- the callback escape from the callback (i.e. do not write it to an
-- @IORef@). Also, do not @unsafeFreezeByteArray@ any of the mutable
-- byte arrays in the callback. The intent is that the callback will
-- write the buffers out, preferably using vectored I/O.
putMany :: Foldable f
=> Int -- ^ Size of shared chunk (use 8176 if uncertain)
-> (a -> Builder) -- ^ Value builder
-> f a -- ^ Collection of values
-> (UnliftedArray (MutableByteArray RealWorld) -> IO b) -- ^ Consume chunks.
-> IO ()
{-# inline putMany #-}
putMany hint@(I# hint#) g xs cb = do
MutableByteArray buf0 <- PM.newByteArray hint
BuilderState bufZ offZ _ cmtsZ <- foldlM
(\st0 a -> do
st1@(BuilderState buf off _ cmts) <- pasteIO (g a) st0
case cmts of
Initial -> pure st1
_ -> do
_ <- cb =<< commitsToArray buf off cmts
pure (BuilderState buf0 0# hint# Initial)
) (BuilderState buf0 0# hint# Initial) xs
_ <- cb =<< commitsToArray bufZ offZ cmtsZ
pure ()
commitsToArray ::
MutableByteArray# RealWorld -- final chunk to append to commits
-> Int# -- offset
-> Commits RealWorld
-> IO (UnliftedArray (MutableByteArray RealWorld))
commitsToArray buf off cmts = do
let ct = countCommits 1 cmts
bufs <- PM.unsafeNewUnliftedArray ct
-- Only shrink the last chunk. Crucially, this is never the first
-- chunk (except on the commitsToArray call at the end of folding
-- over the collection). We only perform this shrink in the hopes
-- that a future GHC will allow reclaiming bytes from shrunk arrays.
shrinkMutableByteArray (MutableByteArray buf) (I# off)
PM.writeUnliftedArray bufs (ct - 1) (MutableByteArray buf)
writeCommitsToArray (ct - 2) bufs cmts
PM.unsafeFreezeUnliftedArray bufs
-- See the documentation for putMany.
writeCommitsToArray ::
Int
-> MutableUnliftedArray RealWorld (MutableByteArray RealWorld)
-> Commits RealWorld
-> IO ()
writeCommitsToArray !ix !arrs x0 = case x0 of
Initial -> pure ()
Mutable buf _ x1 -> do
PM.writeUnliftedArray arrs ix (MutableByteArray buf)
writeCommitsToArray (ix - 1) arrs x1
Immutable arr off len x1 -> do
buf <- PM.newByteArray (I# len)
PM.copyByteArray buf 0 (ByteArray arr) (I# off) (I# len)
PM.writeUnliftedArray arrs ix buf
writeCommitsToArray (ix - 1) arrs x1
countCommits :: Int -> Commits s -> Int
countCommits !n x0 = case x0 of
Initial -> n
Mutable _ _ x1 -> countCommits (n + 1) x1
Immutable _ _ _ x1 -> countCommits (n + 1) x1
-- | Convert a bounded builder to an unbounded one. If the size -- | Convert a bounded builder to an unbounded one. If the size
-- is a constant, use @Arithmetic.Nat.constant@ as the first argument -- is a constant, use @Arithmetic.Nat.constant@ as the first argument
-- to let GHC conjure up this value for you. -- to let GHC conjure up this value for you.
@ -603,7 +677,7 @@ word8PaddedUpperHex w =
ascii :: Char -> Builder ascii :: Char -> Builder
ascii c = fromBoundedOne (Bounded.ascii c) ascii c = fromBoundedOne (Bounded.ascii c)
-- | Encode an UTF8 char. This only uses as much space as is required. -- | Encode a UTF-8 char. This only uses as much space as is required.
char :: Char -> Builder char :: Char -> Builder
char c = fromBounded Nat.constant (Bounded.char c) char c = fromBounded Nat.constant (Bounded.char c)
@ -684,33 +758,43 @@ word16BE w = fromBounded Nat.constant (Bounded.word16BE w)
word8 :: Word8 -> Builder word8 :: Word8 -> Builder
word8 w = fromBoundedOne (Bounded.word8 w) word8 w = fromBoundedOne (Bounded.word8 w)
-- | Variant of 'consLength32BE' the encodes the length in -- | Prefix a builder with the number of bytes that it requires.
-- a little-endian fashion. consLength ::
consLength32LE :: Builder -> Builder Arithmetic.Nat n -- ^ Number of bytes used by the serialization of the length
consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> -> (Int -> Bounded.Builder n) -- ^ Length serialization function
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of -> Builder -- ^ Builder whose length is measured
-> Builder
{-# inline consLength #-}
consLength !n buildSize (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(I# lenSz) = Nat.demote n
!(# s1, buf1, off1, len1, cs1 #) = case len0 >=# lenSz of
1# -> (# s0, buf0, off0, len0, cs0 #) 1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of _ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) -> (# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #) (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of in case f buf1 (off1 +# lenSz) (len1 -# lenSz) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) -> (# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1 1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1 _ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST ST g = UnsafeBounded.pasteST
(Bounded.word32LE (fromIntegral (I# (dist -# 4# )))) (buildSize (fromIntegral (I# (dist -# lenSz))))
(MutableByteArray buf1) (MutableByteArray buf1)
(I# off1) (I# off1)
in case g s2 of in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #) (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
-- | Variant of 'consLength32BE' the encodes the length in
-- a little-endian fashion.
consLength32LE :: Builder -> Builder
consLength32LE = consLength Nat.constant (\x -> Bounded.word32LE (fromIntegral x))
-- | Prefix a builder with its size in bytes. This size is -- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 32-bit word. The need to prefix -- presented as a big-endian 32-bit word. The need to prefix
-- a builder with its length shows up a numbers of wire protocols -- a builder with its length shows up a numbers of wire protocols
-- including those of PostgreSQL and Apache Kafka. Note the -- including those of PostgreSQL and Apache Kafka. Note the
-- equivalence: -- equivalence:
-- --
-- > forall (n :: Int) (x :: Builder). -- > forall (n :: Int) (x :: Builder).
-- > let sz = sizeofByteArray (run n (consLength32BE x)) -- > let sz = sizeofByteArray (run n (consLength32BE x))
-- > consLength32BE x === word32BE (fromIntegral sz) <> x -- > consLength32BE x === word32BE (fromIntegral sz) <> x
@ -718,45 +802,12 @@ consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
-- However, using 'consLength32BE' is much more efficient here -- However, using 'consLength32BE' is much more efficient here
-- since it only materializes the 'ByteArray' once. -- since it only materializes the 'ByteArray' once.
consLength32BE :: Builder -> Builder consLength32BE :: Builder -> Builder
consLength32BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> consLength32BE = consLength Nat.constant (\x -> Bounded.word32BE (fromIntegral x))
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST
(Bounded.word32BE (fromIntegral (I# (dist -# 4# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
-- | Prefix a builder with its size in bytes. This size is -- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 64-bit word. See 'consLength32BE'. -- presented as a big-endian 64-bit word. See 'consLength32BE'.
consLength64BE :: Builder -> Builder consLength64BE :: Builder -> Builder
consLength64BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> consLength64BE = consLength Nat.constant (\x -> Bounded.word64BE (fromIntegral x))
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 8# of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 8# ) (len1 -# 8# ) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST
(Bounded.word64BE (fromIntegral (I# (dist -# 8# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int# commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int#
commitDistance _ !_ Initial = error "chunkDistance: chunk not found" commitDistance _ !_ Initial = error "chunkDistance: chunk not found"
@ -796,3 +847,7 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8 c2w :: Char -> Word8
c2w = fromIntegral . ord c2w = fromIntegral . ord
shrinkMutableByteArray :: MutableByteArray RealWorld -> Int -> IO ()
shrinkMutableByteArray (MutableByteArray x) (I# i) =
IO (\s -> (# Exts.shrinkMutableByteArray# x i s, ()#))

View file

@ -9,7 +9,11 @@
module Data.ByteArray.Builder.Unsafe module Data.ByteArray.Builder.Unsafe
( -- * Types ( -- * Types
Builder(..) Builder(..)
, BuilderState(..)
, Commits(..) , Commits(..)
-- * Execution
, pasteST
, pasteIO
-- * Construction -- * Construction
, fromEffect , fromEffect
-- * Finalization -- * Finalization
@ -30,8 +34,9 @@ import Foreign.C.String (CString)
import GHC.Base (unpackCString#,unpackCStringUtf8#) import GHC.Base (unpackCString#,unpackCStringUtf8#)
import GHC.Exts ((-#),(+#),(>#),(>=#)) import GHC.Exts ((-#),(+#),(>#),(>=#))
import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr)) import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
import GHC.Exts (IsString,Int#,State#) import GHC.Exts (RealWorld,IsString,Int#,State#)
import GHC.ST (ST(ST)) import GHC.ST (ST(ST))
import GHC.IO (stToIO)
import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
@ -50,6 +55,27 @@ newtype Builder
(# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things (# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things
) )
data BuilderState s = BuilderState
(MutableByteArray# s) -- buffer we are currently writing to
Int# -- offset into the current buffer
Int# -- number of bytes remaining in the current buffer
!(Commits s) -- buffers and immutable byte slices that are already committed
-- | Run a builder, performing an in-place update on the state.
-- The @BuilderState@ argument must not be reused after being passed
-- to this function. That is, its use must be affine.
pasteST :: Builder -> BuilderState s -> ST s (BuilderState s)
{-# inline pasteST #-}
pasteST (Builder f) (BuilderState buf off len cmts) = ST $ \s0 ->
case f buf off len cmts s0 of
(# s1, buf1, off1, len1, cmts1 #) ->
(# s1, BuilderState buf1 off1 len1 cmts1 #)
-- | Variant of 'pasteST' that runs in 'IO'.
pasteIO :: Builder -> BuilderState RealWorld -> IO (BuilderState RealWorld)
{-# inline pasteIO #-}
pasteIO b st = stToIO (pasteST b st)
instance IsString Builder where instance IsString Builder where
{-# inline fromString #-} {-# inline fromString #-}
fromString = stringUtf8 fromString = stringUtf8

View file

@ -13,6 +13,7 @@ import Data.Bytes.Chunks (Chunks(ChunksNil,ChunksCons))
import Data.Primitive (PrimArray) import Data.Primitive (PrimArray)
import Data.Word import Data.Word
import Data.Char (ord,chr) import Data.Char (ord,chr)
import Data.IORef (IORef,newIORef,readIORef,writeIORef)
import Data.Primitive (ByteArray) import Data.Primitive (ByteArray)
import Data.Proxy (Proxy(..)) import Data.Proxy (Proxy(..))
import Data.WideWord (Word128(Word128)) import Data.WideWord (Word128(Word128))
@ -28,6 +29,7 @@ import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.Bytes.Chunks as Chunks import qualified Data.Bytes.Chunks as Chunks
import qualified Data.List as L import qualified Data.List as L
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Primitive.Unlifted.Array as PM
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.Encoding as TE import qualified Data.Text.Encoding as TE
import qualified GHC.Exts as Exts import qualified GHC.Exts as Exts
@ -197,8 +199,40 @@ tests = testGroup "Tests"
, lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks)) , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks))
, lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks)) , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks))
] ]
, testGroup "putMany"
[ THU.testCase "A" $ do
ref <- newIORef []
let txt = "hello_world_are_you_listening" :: [Char]
putMany 7 ascii txt (ontoRef ref)
res <- readIORef ref
id $
[ map c2w "hello_w"
, map c2w "o"
, map c2w "rld_are"
, map c2w "_"
, map c2w "you_lis"
, map c2w "t"
, map c2w "ening"
] @=? map Exts.toList (Exts.toList res)
]
] ]
ontoRef ::
IORef [PM.ByteArray]
-> PM.UnliftedArray (PM.MutableByteArray Exts.RealWorld)
-> IO ()
ontoRef !ref xs = do
rs <- readIORef ref
ps <- PM.foldlUnliftedArrayM'
(\ys buf -> do
len <- PM.getSizeofMutableByteArray buf
dst <- PM.newByteArray len
PM.copyMutableByteArray dst 0 buf 0 len
dst' <- PM.unsafeFreezeByteArray dst
pure (ys ++ [dst'])
) [] xs
writeIORef ref (rs ++ ps)
instance Arbitrary Chunks where instance Arbitrary Chunks where
arbitrary = do arbitrary = do
xs :: [[Word8]] <- TQC.arbitrary xs :: [[Word8]] <- TQC.arbitrary
@ -232,5 +266,8 @@ showWord64PaddedUpperHex = printf "%016X"
runConcat :: Int -> Builder -> ByteArray runConcat :: Int -> Builder -> ByteArray
runConcat n = Chunks.concat . run n runConcat n = Chunks.concat . run n
c2w :: Char -> Word8
c2w = fromIntegral . ord
instance Arbitrary Word128 where instance Arbitrary Word128 where
arbitrary = liftA2 Word128 TQC.arbitrary TQC.arbitrary arbitrary = liftA2 Word128 TQC.arbitrary TQC.arbitrary