From c84f6cbccaac39b07250757cfc7c44f0017d10da Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Thu, 10 Oct 2019 09:10:44 -0400 Subject: [PATCH] Add flush, copy, and insert --- CHANGELOG.md | 4 +- small-bytearray-builder.cabal | 1 - src/Data/ByteArray/Builder.hs | 105 ++++++++++++++++++++++++++++------ test/Main.hs | 11 +++- 4 files changed, 100 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc027df..d51176e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,13 @@ # Revision history for small-bytearray-builder -## 0.2.2.0 -- 2019-??-?? +## 0.3.0.0 -- 2019-??-?? * Introduce `consLensBE32` for efficient serialization of wire protocols that require prefixing a payload with its length. * Add `int64BE` as a convenience. * Add little-endian encoding functions for `Word16`, `Word32`, and `Word64`. +* Add `flush`, `copy`, and `insert` for better control when + converting byte sequences to builders. ## 0.2.1.0 -- 2019-09-05 diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 5203e65..97fba3c 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -46,7 +46,6 @@ library , byteslice >=0.1 && <0.2 , primitive-offset >=0.2 && <0.3 , run-st >=0.1 && <0.2 - , vector >=0.12.0.3 && <0.13 , bytestring >=0.10.8.2 && <0.11 , text-short >=0.1.3 && <0.2 , natural-arithmetic >=0.1 && <0.2 diff --git a/src/Data/ByteArray/Builder.hs b/src/Data/ByteArray/Builder.hs index fbea1da..1c7e59e 100644 --- a/src/Data/ByteArray/Builder.hs +++ b/src/Data/ByteArray/Builder.hs @@ -15,6 +15,8 @@ module Data.ByteArray.Builder , run -- * Materialized Byte Sequences , bytes + , copy + , insert , byteArray , shortTextUtf8 , shortTextJsonString @@ -62,39 +64,41 @@ module Data.ByteArray.Builder , int64LE -- *** Many , word8Array + -- **** Little Endian + , word16ArrayLE -- ** Prefixing with Length , consLength32BE , consLength64BE -- * Encode Floating-Point Types -- ** Human-Readable , doubleDec + -- * Control + , flush ) where import Control.Monad.Primitive (primitive_) -import Control.Monad.ST (ST,stToIO,runST) -import Control.Monad.ST.Run (runByteArrayST) +import Control.Monad.ST (ST,runST) import Data.ByteArray.Builder.Unsafe (Builder(Builder)) import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable)) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import Data.ByteString.Short.Internal (ShortByteString(SBS)) -import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) +import Data.Bytes.Types (Bytes(Bytes)) import Data.Char (ord) import Data.Int (Int64,Int32,Int16,Int8) import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..)) -import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) import Data.Text.Short (ShortText) import Data.Word (Word64,Word32,Word16,Word8) -import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#)) +import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#)) import GHC.Exts (MutableByteArray#,(+#),(-#),(<#)) import GHC.ST (ST(ST)) import Data.Bytes.Chunks (Chunks(..)) +import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder) import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Types as Arithmetic import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM -import qualified Data.Vector as V import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded @@ -164,18 +168,52 @@ fromBoundedOne (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> byteArray :: ByteArray -> Builder byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a)) --- | Create a builder from a sliced byte sequence. +-- | Create a builder from a sliced byte sequence. The variants +-- 'copy' and 'insert' provide more control over whether or not +-- the byte sequence is copied or aliased. This function is preferred +-- when the user does not know the size of the byte sequence. bytes :: Bytes -> Builder -bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder - (\buf0 off0 len0 cs0 s0 -> if slen >= 1024 - then case Exts.newByteArray# 0# s0 of - (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #) - else case len0 <# slen# of - 1# -> case Exts.newByteArray# 4080# s0 of - (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of - s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #) - _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in - (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #) +bytes (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder + -- There are three cases to consider: (1) there is not enough + -- space and (1a) the chunk is not small or (1b) the chunk is + -- small; (2) There is enough space for a copy. + (\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of + 1# -> case slen# >=# 256# of + 1# -> case Exts.newByteArray# 0# s0 of + (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #) + _ -> case Exts.newByteArray# 4080# s0 of + (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of + s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #) + _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in + (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #) + ) + +-- | Create a builder from a byte sequence. This always results in a +-- call to @memcpy@. This is beneficial when the byte sequence is +-- known to be small (less than 256 bytes). +copy :: Bytes -> Builder +copy (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder + (\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of + 1# -> case Exts.newByteArray# newSz s0 of + (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of + s2 -> (# s2, buf1, slen#, newSz -# slen#, Mutable buf0 off0 cs0 #) + _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in + (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #) + ) + where + !(I# newSz) = max (I# slen#) 4080 + +-- | Create a builder from a byte sequence. This never calls @memcpy@. +-- Instead, it pushes a chunk that references the argument byte sequence. +-- This wastes the remaining space in the active chunk, so it may adversely +-- affect performance if used carelessly. See 'flush' for a way to mitigate +-- this problem. This functions is most beneficial when the byte sequence +-- is known to be large (more than 8192 bytes). +insert :: Bytes -> Builder +insert (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder + (\buf0 off0 _ cs0 s0 -> case Exts.newByteArray# 0# s0 of + (# s1, buf1 #) -> + (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #) ) -- | Create a builder from a slice of an array of 'Word8'. There is the same @@ -184,6 +222,20 @@ bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder word8Array :: PrimArray Word8 -> Int -> Int -> Builder word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len) +word16ArrayLE :: PrimArray Word16 -> Int -> Int -> Builder +word16ArrayLE src@(PrimArray arr) soff0 slen0 = case targetByteOrder of + LittleEndian -> bytes (Bytes (ByteArray arr) (soff0 * 2) (slen0 * 2)) + BigEndian -> fromFunction (slen0 * 2) (go soff0 (soff0 + slen0)) + where + go :: Int -> Int -> MutableByteArray s -> Int -> ST s Int + go !soff !send !dst !doff = if soff < send + then do + doff' <- UnsafeBounded.pasteST + (Bounded.word16LE (PM.indexPrimArray src soff)) + dst doff + go (soff + 1) send dst doff' + else pure doff + -- Internal function. Precondition, the referenced slice of the -- byte sequence is UTF-8 encoded text. slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder @@ -502,6 +554,25 @@ commitDistance target !n (Mutable buf len cs) = 1# -> n +# len _ -> commitDistance target (n +# len) cs +-- | Push the buffer currently being filled onto the chunk list, +-- allocating a new active buffer of the requested size. This is +-- helpful when a small builder is sandwhiched between two large +-- zero-copy builders: +-- +-- > insert bigA <> flush 1 <> word8 0x42 <> insert bigB +-- +-- Without @flush 1@, @word8 0x42@ would see the zero-byte active +-- buffer that 'insert' returned, decide that it needed more space, +-- and allocate a 4080-byte buffer to which only a single byte +-- would be written. +flush :: Int -> Builder +flush !reqSz = Builder $ \buf0 off0 _ cs0 s0 -> + case Exts.newByteArray# sz# s0 of + (# sX, bufX #) -> + (# sX, bufX, 0#, sz#, Mutable buf0 off0 cs0 #) + where + !(I# sz# ) = max reqSz 0 + -- ShortText is already UTF-8 encoded. This is a no-op. shortTextToByteArray :: ShortText -> ByteArray shortTextToByteArray x = case TS.toShortByteString x of diff --git a/test/Main.hs b/test/Main.hs index 9e8001d..f78e56b 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -23,7 +23,6 @@ import qualified Data.List as L import qualified Data.Primitive as PM import qualified Data.Text as T import qualified Data.Text.Encoding as TE -import qualified Data.Vector as V import qualified GHC.Exts as Exts import qualified Test.Tasty.HUnit as THU import qualified Test.Tasty.QuickCheck as TQC @@ -63,7 +62,7 @@ tests = testGroup "Tests" runConcat 1 (consLength32BE (word8Dec w)) === pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w) - , TQC.testProperty "consLength64BE" $ \w -> + , TQC.testProperty "consLength64BE-uni" $ \w -> pack ( '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : chr (L.length (show w)) @@ -71,6 +70,14 @@ tests = testGroup "Tests" ) === runConcat 1 (consLength64BE (word16Dec w)) + , TQC.testProperty "consLength64BE-multi" $ \w -> + pack + ( '\x00' : '\x00' : '\x00' : '\x00' + : '\x00' : '\x00' : '\x00' : chr (1 + L.length (show w)) + : '\x42' : show w + ) + === + runConcat 1 (consLength64BE (word8 0x42 <> flush 2 <> word16Dec w)) , THU.testCase "stringUtf8" $ packUtf8 "¿Cómo estás? I am doing well." @=? runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.")