Add flush, copy, and insert

This commit is contained in:
Andrew Martin 2019-10-10 09:10:44 -04:00
parent 8baf4cc369
commit c84f6cbcca
4 changed files with 100 additions and 21 deletions

View file

@ -1,11 +1,13 @@
# Revision history for small-bytearray-builder # Revision history for small-bytearray-builder
## 0.2.2.0 -- 2019-??-?? ## 0.3.0.0 -- 2019-??-??
* Introduce `consLensBE32` for efficient serialization of wire protocols * Introduce `consLensBE32` for efficient serialization of wire protocols
that require prefixing a payload with its length. that require prefixing a payload with its length.
* Add `int64BE` as a convenience. * Add `int64BE` as a convenience.
* Add little-endian encoding functions for `Word16`, `Word32`, and `Word64`. * Add little-endian encoding functions for `Word16`, `Word32`, and `Word64`.
* Add `flush`, `copy`, and `insert` for better control when
converting byte sequences to builders.
## 0.2.1.0 -- 2019-09-05 ## 0.2.1.0 -- 2019-09-05

View file

@ -46,7 +46,6 @@ library
, byteslice >=0.1 && <0.2 , byteslice >=0.1 && <0.2
, primitive-offset >=0.2 && <0.3 , primitive-offset >=0.2 && <0.3
, run-st >=0.1 && <0.2 , run-st >=0.1 && <0.2
, vector >=0.12.0.3 && <0.13
, bytestring >=0.10.8.2 && <0.11 , bytestring >=0.10.8.2 && <0.11
, text-short >=0.1.3 && <0.2 , text-short >=0.1.3 && <0.2
, natural-arithmetic >=0.1 && <0.2 , natural-arithmetic >=0.1 && <0.2

View file

@ -15,6 +15,8 @@ module Data.ByteArray.Builder
, run , run
-- * Materialized Byte Sequences -- * Materialized Byte Sequences
, bytes , bytes
, copy
, insert
, byteArray , byteArray
, shortTextUtf8 , shortTextUtf8
, shortTextJsonString , shortTextJsonString
@ -62,39 +64,41 @@ module Data.ByteArray.Builder
, int64LE , int64LE
-- *** Many -- *** Many
, word8Array , word8Array
-- **** Little Endian
, word16ArrayLE
-- ** Prefixing with Length -- ** Prefixing with Length
, consLength32BE , consLength32BE
, consLength64BE , consLength64BE
-- * Encode Floating-Point Types -- * Encode Floating-Point Types
-- ** Human-Readable -- ** Human-Readable
, doubleDec , doubleDec
-- * Control
, flush
) where ) where
import Control.Monad.Primitive (primitive_) import Control.Monad.Primitive (primitive_)
import Control.Monad.ST (ST,stToIO,runST) import Control.Monad.ST (ST,runST)
import Control.Monad.ST.Run (runByteArrayST)
import Data.ByteArray.Builder.Unsafe (Builder(Builder)) import Data.ByteArray.Builder.Unsafe (Builder(Builder))
import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable)) import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) import Data.Bytes.Types (Bytes(Bytes))
import Data.Char (ord) import Data.Char (ord)
import Data.Int (Int64,Int32,Int16,Int8) import Data.Int (Int64,Int32,Int16,Int8)
import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..)) import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.Text.Short (ShortText) import Data.Text.Short (ShortText)
import Data.Word (Word64,Word32,Word16,Word8) import Data.Word (Word64,Word32,Word16,Word8)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#)) import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
import GHC.Exts (MutableByteArray#,(+#),(-#),(<#)) import GHC.Exts (MutableByteArray#,(+#),(-#),(<#))
import GHC.ST (ST(ST)) import GHC.ST (ST(ST))
import Data.Bytes.Chunks (Chunks(..)) import Data.Bytes.Chunks (Chunks(..))
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic import qualified Arithmetic.Types as Arithmetic
import qualified GHC.Exts as Exts import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS import qualified Data.Text.Short as TS
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
@ -164,18 +168,52 @@ fromBoundedOne (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
byteArray :: ByteArray -> Builder byteArray :: ByteArray -> Builder
byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a)) byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
-- | Create a builder from a sliced byte sequence. -- | Create a builder from a sliced byte sequence. The variants
-- 'copy' and 'insert' provide more control over whether or not
-- the byte sequence is copied or aliased. This function is preferred
-- when the user does not know the size of the byte sequence.
bytes :: Bytes -> Builder bytes :: Bytes -> Builder
bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder bytes (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
(\buf0 off0 len0 cs0 s0 -> if slen >= 1024 -- There are three cases to consider: (1) there is not enough
then case Exts.newByteArray# 0# s0 of -- space and (1a) the chunk is not small or (1b) the chunk is
(# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #) -- small; (2) There is enough space for a copy.
else case len0 <# slen# of (\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of
1# -> case Exts.newByteArray# 4080# s0 of 1# -> case slen# >=# 256# of
(# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of 1# -> case Exts.newByteArray# 0# s0 of
s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #) (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
_ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in _ -> case Exts.newByteArray# 4080# s0 of
(# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #) (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #)
_ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
(# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
)
-- | Create a builder from a byte sequence. This always results in a
-- call to @memcpy@. This is beneficial when the byte sequence is
-- known to be small (less than 256 bytes).
copy :: Bytes -> Builder
copy (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
(\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of
1# -> case Exts.newByteArray# newSz s0 of
(# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
s2 -> (# s2, buf1, slen#, newSz -# slen#, Mutable buf0 off0 cs0 #)
_ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
(# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
)
where
!(I# newSz) = max (I# slen#) 4080
-- | Create a builder from a byte sequence. This never calls @memcpy@.
-- Instead, it pushes a chunk that references the argument byte sequence.
-- This wastes the remaining space in the active chunk, so it may adversely
-- affect performance if used carelessly. See 'flush' for a way to mitigate
-- this problem. This functions is most beneficial when the byte sequence
-- is known to be large (more than 8192 bytes).
insert :: Bytes -> Builder
insert (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
(\buf0 off0 _ cs0 s0 -> case Exts.newByteArray# 0# s0 of
(# s1, buf1 #) ->
(# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
) )
-- | Create a builder from a slice of an array of 'Word8'. There is the same -- | Create a builder from a slice of an array of 'Word8'. There is the same
@ -184,6 +222,20 @@ bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder
word8Array :: PrimArray Word8 -> Int -> Int -> Builder word8Array :: PrimArray Word8 -> Int -> Int -> Builder
word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len) word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len)
word16ArrayLE :: PrimArray Word16 -> Int -> Int -> Builder
word16ArrayLE src@(PrimArray arr) soff0 slen0 = case targetByteOrder of
LittleEndian -> bytes (Bytes (ByteArray arr) (soff0 * 2) (slen0 * 2))
BigEndian -> fromFunction (slen0 * 2) (go soff0 (soff0 + slen0))
where
go :: Int -> Int -> MutableByteArray s -> Int -> ST s Int
go !soff !send !dst !doff = if soff < send
then do
doff' <- UnsafeBounded.pasteST
(Bounded.word16LE (PM.indexPrimArray src soff))
dst doff
go (soff + 1) send dst doff'
else pure doff
-- Internal function. Precondition, the referenced slice of the -- Internal function. Precondition, the referenced slice of the
-- byte sequence is UTF-8 encoded text. -- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
@ -502,6 +554,25 @@ commitDistance target !n (Mutable buf len cs) =
1# -> n +# len 1# -> n +# len
_ -> commitDistance target (n +# len) cs _ -> commitDistance target (n +# len) cs
-- | Push the buffer currently being filled onto the chunk list,
-- allocating a new active buffer of the requested size. This is
-- helpful when a small builder is sandwhiched between two large
-- zero-copy builders:
--
-- > insert bigA <> flush 1 <> word8 0x42 <> insert bigB
--
-- Without @flush 1@, @word8 0x42@ would see the zero-byte active
-- buffer that 'insert' returned, decide that it needed more space,
-- and allocate a 4080-byte buffer to which only a single byte
-- would be written.
flush :: Int -> Builder
flush !reqSz = Builder $ \buf0 off0 _ cs0 s0 ->
case Exts.newByteArray# sz# s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, sz#, Mutable buf0 off0 cs0 #)
where
!(I# sz# ) = max reqSz 0
-- ShortText is already UTF-8 encoded. This is a no-op. -- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray shortTextToByteArray :: ShortText -> ByteArray
shortTextToByteArray x = case TS.toShortByteString x of shortTextToByteArray x = case TS.toShortByteString x of

View file

@ -23,7 +23,6 @@ import qualified Data.List as L
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.Encoding as TE import qualified Data.Text.Encoding as TE
import qualified Data.Vector as V
import qualified GHC.Exts as Exts import qualified GHC.Exts as Exts
import qualified Test.Tasty.HUnit as THU import qualified Test.Tasty.HUnit as THU
import qualified Test.Tasty.QuickCheck as TQC import qualified Test.Tasty.QuickCheck as TQC
@ -63,7 +62,7 @@ tests = testGroup "Tests"
runConcat 1 (consLength32BE (word8Dec w)) runConcat 1 (consLength32BE (word8Dec w))
=== ===
pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w) pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w)
, TQC.testProperty "consLength64BE" $ \w -> , TQC.testProperty "consLength64BE-uni" $ \w ->
pack pack
( '\x00' : '\x00' : '\x00' : '\x00' ( '\x00' : '\x00' : '\x00' : '\x00'
: '\x00' : '\x00' : '\x00' : chr (L.length (show w)) : '\x00' : '\x00' : '\x00' : chr (L.length (show w))
@ -71,6 +70,14 @@ tests = testGroup "Tests"
) )
=== ===
runConcat 1 (consLength64BE (word16Dec w)) runConcat 1 (consLength64BE (word16Dec w))
, TQC.testProperty "consLength64BE-multi" $ \w ->
pack
( '\x00' : '\x00' : '\x00' : '\x00'
: '\x00' : '\x00' : '\x00' : chr (1 + L.length (show w))
: '\x42' : show w
)
===
runConcat 1 (consLength64BE (word8 0x42 <> flush 2 <> word16Dec w))
, THU.testCase "stringUtf8" $ , THU.testCase "stringUtf8" $
packUtf8 "¿Cómo estás? I am doing well." @=? packUtf8 "¿Cómo estás? I am doing well." @=?
runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.") runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.")