Add putMany and considate consLength functions

This commit is contained in:
Andrew Martin 2019-11-22 16:24:48 -05:00
parent ae5d17ce5c
commit c7fcaff97f
5 changed files with 173 additions and 47 deletions

View file

@ -1,5 +1,11 @@
# Revision history for small-bytearray-builder
## 0.3.2.0 -- 2019-??-??
* Add `putMany`, which allows pasting into the same mutable byte
array over and over.
* Add `consLength`.
## 0.3.1.0 -- 2019-11-20
* Add big-endian and little-endian parsers for `Word128`. This includes

View file

@ -1,6 +1,6 @@
cabal-version: 2.2
name: small-bytearray-builder
version: 0.3.1.0
version: 0.3.2.0
synopsis: Serialize to a small byte arrays
description:
This is similar to the builder facilities provided by
@ -47,6 +47,7 @@ library
, bytestring >=0.10.8.2 && <0.11
, natural-arithmetic >=0.1 && <0.2
, primitive-offset >=0.2 && <0.3
, primitive-unlifted >=0.1.2 && <0.2
, run-st >=0.1 && <0.2
, text-short >=0.1.3 && <0.2
, wide-word >=0.1.0.9 && <0.2
@ -74,6 +75,7 @@ test-suite test
, bytestring
, natural-arithmetic
, primitive
, primitive-unlifted >=0.1.2
, quickcheck-classes >=0.6.4
, small-bytearray-builder
, tasty >=1.2.3 && <1.3

View file

@ -13,6 +13,7 @@ module Data.ByteArray.Builder
, fromBounded
-- * Evaluation
, run
, putMany
-- * Materialized Byte Sequences
, bytes
, copy
@ -88,6 +89,7 @@ module Data.ByteArray.Builder
, int32ArrayLE
, int16ArrayLE
-- ** Prefixing with Length
, consLength
, consLength32LE
, consLength32BE
, consLength64BE
@ -100,6 +102,7 @@ module Data.ByteArray.Builder
import Control.Monad.ST (ST,runST)
import Data.ByteArray.Builder.Unsafe (Builder(Builder))
import Data.ByteArray.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.ByteArray.Builder.Unsafe (reverseCommitsOntoChunks)
import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
@ -107,14 +110,17 @@ import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Chunks (Chunks(ChunksNil))
import Data.Bytes.Types (Bytes(Bytes))
import Data.Char (ord)
import Data.Foldable (foldlM)
import Data.Int (Int64,Int32,Int16,Int8)
import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
import Data.Primitive.Unlifted.Array (MutableUnliftedArray,UnliftedArray)
import Data.Text.Short (ShortText)
import Data.WideWord (Word128)
import Data.Word (Word64,Word32,Word16,Word8)
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
import GHC.Exts (MutableByteArray#,(+#),(-#),(<#))
import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#))
import GHC.IO (IO(IO))
import GHC.ST (ST(ST))
import qualified Arithmetic.Nat as Nat
@ -122,6 +128,7 @@ import qualified Arithmetic.Types as Arithmetic
import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
import qualified Data.Primitive as PM
import qualified Data.Primitive.Unlifted.Array as PM
import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts
@ -137,6 +144,73 @@ run hint@(I# hint# ) (Builder f) = runST $ do
(# s1, Mutable bufX offX csX #)
reverseCommitsOntoChunks ChunksNil cs
-- | Run a builder against lots of elements. This fills the same
-- underlying buffer over and over again. Do not let the argument to
-- the callback escape from the callback (i.e. do not write it to an
-- @IORef@). Also, do not @unsafeFreezeByteArray@ any of the mutable
-- byte arrays in the callback. The intent is that the callback will
-- write the buffers out, preferably using vectored I/O.
putMany :: Foldable f
=> Int -- ^ Size of shared chunk (use 8176 if uncertain)
-> (a -> Builder) -- ^ Value builder
-> f a -- ^ Collection of values
-> (UnliftedArray (MutableByteArray RealWorld) -> IO b) -- ^ Consume chunks.
-> IO ()
{-# inline putMany #-}
putMany hint@(I# hint#) g xs cb = do
MutableByteArray buf0 <- PM.newByteArray hint
BuilderState bufZ offZ _ cmtsZ <- foldlM
(\st0 a -> do
st1@(BuilderState buf off _ cmts) <- pasteIO (g a) st0
case cmts of
Initial -> pure st1
_ -> do
_ <- cb =<< commitsToArray buf off cmts
pure (BuilderState buf0 0# hint# Initial)
) (BuilderState buf0 0# hint# Initial) xs
_ <- cb =<< commitsToArray bufZ offZ cmtsZ
pure ()
commitsToArray ::
MutableByteArray# RealWorld -- final chunk to append to commits
-> Int# -- offset
-> Commits RealWorld
-> IO (UnliftedArray (MutableByteArray RealWorld))
commitsToArray buf off cmts = do
let ct = countCommits 1 cmts
bufs <- PM.unsafeNewUnliftedArray ct
-- Only shrink the last chunk. Crucially, this is never the first
-- chunk (except on the commitsToArray call at the end of folding
-- over the collection). We only perform this shrink in the hopes
-- that a future GHC will allow reclaiming bytes from shrunk arrays.
shrinkMutableByteArray (MutableByteArray buf) (I# off)
PM.writeUnliftedArray bufs (ct - 1) (MutableByteArray buf)
writeCommitsToArray (ct - 2) bufs cmts
PM.unsafeFreezeUnliftedArray bufs
-- See the documentation for putMany.
writeCommitsToArray ::
Int
-> MutableUnliftedArray RealWorld (MutableByteArray RealWorld)
-> Commits RealWorld
-> IO ()
writeCommitsToArray !ix !arrs x0 = case x0 of
Initial -> pure ()
Mutable buf _ x1 -> do
PM.writeUnliftedArray arrs ix (MutableByteArray buf)
writeCommitsToArray (ix - 1) arrs x1
Immutable arr off len x1 -> do
buf <- PM.newByteArray (I# len)
PM.copyByteArray buf 0 (ByteArray arr) (I# off) (I# len)
PM.writeUnliftedArray arrs ix buf
writeCommitsToArray (ix - 1) arrs x1
countCommits :: Int -> Commits s -> Int
countCommits !n x0 = case x0 of
Initial -> n
Mutable _ _ x1 -> countCommits (n + 1) x1
Immutable _ _ _ x1 -> countCommits (n + 1) x1
-- | Convert a bounded builder to an unbounded one. If the size
-- is a constant, use @Arithmetic.Nat.constant@ as the first argument
-- to let GHC conjure up this value for you.
@ -603,7 +677,7 @@ word8PaddedUpperHex w =
ascii :: Char -> Builder
ascii c = fromBoundedOne (Bounded.ascii c)
-- | Encode an UTF8 char. This only uses as much space as is required.
-- | Encode a UTF-8 char. This only uses as much space as is required.
char :: Char -> Builder
char c = fromBounded Nat.constant (Bounded.char c)
@ -684,33 +758,43 @@ word16BE w = fromBounded Nat.constant (Bounded.word16BE w)
word8 :: Word8 -> Builder
word8 w = fromBoundedOne (Bounded.word8 w)
-- | Variant of 'consLength32BE' the encodes the length in
-- a little-endian fashion.
consLength32LE :: Builder -> Builder
consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of
-- | Prefix a builder with the number of bytes that it requires.
consLength ::
Arithmetic.Nat n -- ^ Number of bytes used by the serialization of the length
-> (Int -> Bounded.Builder n) -- ^ Length serialization function
-> Builder -- ^ Builder whose length is measured
-> Builder
{-# inline consLength #-}
consLength !n buildSize (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(I# lenSz) = Nat.demote n
!(# s1, buf1, off1, len1, cs1 #) = case len0 >=# lenSz of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of
in case f buf1 (off1 +# lenSz) (len1 -# lenSz) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST
(Bounded.word32LE (fromIntegral (I# (dist -# 4# ))))
(buildSize (fromIntegral (I# (dist -# lenSz))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
-- | Variant of 'consLength32BE' the encodes the length in
-- a little-endian fashion.
consLength32LE :: Builder -> Builder
consLength32LE = consLength Nat.constant (\x -> Bounded.word32LE (fromIntegral x))
-- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 32-bit word. The need to prefix
-- a builder with its length shows up a numbers of wire protocols
-- including those of PostgreSQL and Apache Kafka. Note the
-- equivalence:
--
--
-- > forall (n :: Int) (x :: Builder).
-- > let sz = sizeofByteArray (run n (consLength32BE x))
-- > consLength32BE x === word32BE (fromIntegral sz) <> x
@ -718,45 +802,12 @@ consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
-- However, using 'consLength32BE' is much more efficient here
-- since it only materializes the 'ByteArray' once.
consLength32BE :: Builder -> Builder
consLength32BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST
(Bounded.word32BE (fromIntegral (I# (dist -# 4# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
consLength32BE = consLength Nat.constant (\x -> Bounded.word32BE (fromIntegral x))
-- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 64-bit word. See 'consLength32BE'.
consLength64BE :: Builder -> Builder
consLength64BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 8# of
1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> case Exts.newByteArray# 4080# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
in case f buf1 (off1 +# 8# ) (len1 -# 8# ) cs1 s1 of
(# s2, buf2, off2, len2, cs2 #) ->
let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
1# -> off2 -# off1
_ -> commitDistance buf1 off2 cs2 -# off1
ST g = UnsafeBounded.pasteST
(Bounded.word64BE (fromIntegral (I# (dist -# 8# ))))
(MutableByteArray buf1)
(I# off1)
in case g s2 of
(# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
consLength64BE = consLength Nat.constant (\x -> Bounded.word64BE (fromIntegral x))
commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int#
commitDistance _ !_ Initial = error "chunkDistance: chunk not found"
@ -796,3 +847,7 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord
shrinkMutableByteArray :: MutableByteArray RealWorld -> Int -> IO ()
shrinkMutableByteArray (MutableByteArray x) (I# i) =
IO (\s -> (# Exts.shrinkMutableByteArray# x i s, ()#))

View file

@ -9,7 +9,11 @@
module Data.ByteArray.Builder.Unsafe
( -- * Types
Builder(..)
, BuilderState(..)
, Commits(..)
-- * Execution
, pasteST
, pasteIO
-- * Construction
, fromEffect
-- * Finalization
@ -30,8 +34,9 @@ import Foreign.C.String (CString)
import GHC.Base (unpackCString#,unpackCStringUtf8#)
import GHC.Exts ((-#),(+#),(>#),(>=#))
import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
import GHC.Exts (IsString,Int#,State#)
import GHC.Exts (RealWorld,IsString,Int#,State#)
import GHC.ST (ST(ST))
import GHC.IO (stToIO)
import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
@ -50,6 +55,27 @@ newtype Builder
(# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things
)
data BuilderState s = BuilderState
(MutableByteArray# s) -- buffer we are currently writing to
Int# -- offset into the current buffer
Int# -- number of bytes remaining in the current buffer
!(Commits s) -- buffers and immutable byte slices that are already committed
-- | Run a builder, performing an in-place update on the state.
-- The @BuilderState@ argument must not be reused after being passed
-- to this function. That is, its use must be affine.
pasteST :: Builder -> BuilderState s -> ST s (BuilderState s)
{-# inline pasteST #-}
pasteST (Builder f) (BuilderState buf off len cmts) = ST $ \s0 ->
case f buf off len cmts s0 of
(# s1, buf1, off1, len1, cmts1 #) ->
(# s1, BuilderState buf1 off1 len1 cmts1 #)
-- | Variant of 'pasteST' that runs in 'IO'.
pasteIO :: Builder -> BuilderState RealWorld -> IO (BuilderState RealWorld)
{-# inline pasteIO #-}
pasteIO b st = stToIO (pasteST b st)
instance IsString Builder where
{-# inline fromString #-}
fromString = stringUtf8

View file

@ -13,6 +13,7 @@ import Data.Bytes.Chunks (Chunks(ChunksNil,ChunksCons))
import Data.Primitive (PrimArray)
import Data.Word
import Data.Char (ord,chr)
import Data.IORef (IORef,newIORef,readIORef,writeIORef)
import Data.Primitive (ByteArray)
import Data.Proxy (Proxy(..))
import Data.WideWord (Word128(Word128))
@ -28,6 +29,7 @@ import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.Bytes.Chunks as Chunks
import qualified Data.List as L
import qualified Data.Primitive as PM
import qualified Data.Primitive.Unlifted.Array as PM
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified GHC.Exts as Exts
@ -197,8 +199,40 @@ tests = testGroup "Tests"
, lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks))
, lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks))
]
, testGroup "putMany"
[ THU.testCase "A" $ do
ref <- newIORef []
let txt = "hello_world_are_you_listening" :: [Char]
putMany 7 ascii txt (ontoRef ref)
res <- readIORef ref
id $
[ map c2w "hello_w"
, map c2w "o"
, map c2w "rld_are"
, map c2w "_"
, map c2w "you_lis"
, map c2w "t"
, map c2w "ening"
] @=? map Exts.toList (Exts.toList res)
]
]
ontoRef ::
IORef [PM.ByteArray]
-> PM.UnliftedArray (PM.MutableByteArray Exts.RealWorld)
-> IO ()
ontoRef !ref xs = do
rs <- readIORef ref
ps <- PM.foldlUnliftedArrayM'
(\ys buf -> do
len <- PM.getSizeofMutableByteArray buf
dst <- PM.newByteArray len
PM.copyMutableByteArray dst 0 buf 0 len
dst' <- PM.unsafeFreezeByteArray dst
pure (ys ++ [dst'])
) [] xs
writeIORef ref (rs ++ ps)
instance Arbitrary Chunks where
arbitrary = do
xs :: [[Word8]] <- TQC.arbitrary
@ -232,5 +266,8 @@ showWord64PaddedUpperHex = printf "%016X"
runConcat :: Int -> Builder -> ByteArray
runConcat n = Chunks.concat . run n
c2w :: Char -> Word8
c2w = fromIntegral . ord
instance Arbitrary Word128 where
arbitrary = liftA2 Word128 TQC.arbitrary TQC.arbitrary