Add flush, copy, and insert

2019-10-10 09:10:44 -04:00 · 2019-10-10 09:10:44 -04:00 · c84f6cbcca
commit c84f6cbcca
parent 8baf4cc369
4 changed files with 100 additions and 21 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,11 +1,13 @@
 # Revision history for small-bytearray-builder
-## 0.2.2.0 -- 2019-??-??
+## 0.3.0.0 -- 2019-??-??
 * Introduce `consLensBE32` for efficient serialization of wire protocols
  that require prefixing a payload with its length.
 * Add `int64BE` as a convenience.
 * Add little-endian encoding functions for `Word16`, `Word32`, and `Word64`.
 * Add `flush`, `copy`, and `insert` for better control when
  converting byte sequences to builders.
 ## 0.2.1.0 -- 2019-09-05
--- a/small-bytearray-builder.cabal
+++ b/small-bytearray-builder.cabal
@ -46,7 +46,6 @@ library
    , byteslice >=0.1 && <0.2
    , primitive-offset >=0.2 && <0.3
    , run-st >=0.1 && <0.2
    , vector >=0.12.0.3 && <0.13
    , bytestring >=0.10.8.2 && <0.11
    , text-short >=0.1.3 && <0.2
    , natural-arithmetic >=0.1 && <0.2
--- a/src/Data/ByteArray/Builder.hs
+++ b/src/Data/ByteArray/Builder.hs
@ -15,6 +15,8 @@ module Data.ByteArray.Builder
  , run
    -- * Materialized Byte Sequences
  , bytes
  , copy
  , insert
  , byteArray
  , shortTextUtf8
  , shortTextJsonString
@ -62,39 +64,41 @@ module Data.ByteArray.Builder
  , int64LE
    -- *** Many
  , word8Array
    -- **** Little Endian
  , word16ArrayLE
    -- ** Prefixing with Length
  , consLength32BE
  , consLength64BE
    -- * Encode Floating-Point Types
    -- ** Human-Readable
  , doubleDec
    -- * Control
  , flush
  ) where
 import Control.Monad.Primitive (primitive_)
-import Control.Monad.ST (ST,stToIO,runST)
+import Control.Monad.ST (ST,runST)
 import Control.Monad.ST.Run (runByteArrayST)
 import Data.ByteArray.Builder.Unsafe (Builder(Builder))
 import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
-import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
+import Data.Bytes.Types (Bytes(Bytes))
 import Data.Char (ord)
 import Data.Int (Int64,Int32,Int16,Int8)
 import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
 import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
 import Data.Text.Short (ShortText)
 import Data.Word (Word64,Word32,Word16,Word8)
-import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#))
+import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
 import GHC.Exts (MutableByteArray#,(+#),(-#),(<#))
 import GHC.ST (ST(ST))
 import Data.Bytes.Chunks (Chunks(..))
 import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
 import qualified Arithmetic.Nat as Nat
 import qualified Arithmetic.Types as Arithmetic
 import qualified GHC.Exts as Exts
 import qualified Data.Text.Short as TS
 import qualified Data.Primitive as PM
 import qualified Data.Vector as V
 import qualified Data.ByteArray.Builder.Bounded as Bounded
 import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
@ -164,18 +168,52 @@ fromBoundedOne (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
 byteArray :: ByteArray -> Builder
 byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
-- | Create a builder from a sliced byte sequence.
+-- | Create a builder from a sliced byte sequence. The variants
 -- 'copy' and 'insert' provide more control over whether or not
 -- the byte sequence is copied or aliased. This function is preferred
 -- when the user does not know the size of the byte sequence.
 bytes :: Bytes -> Builder
-bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder
+bytes (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
-  (\buf0 off0 len0 cs0 s0 -> if slen >= 1024
+  -- There are three cases to consider: (1) there is not enough
-    then case Exts.newByteArray# 0# s0 of
+  -- space and (1a) the chunk is not small or (1b) the chunk is
-      (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
+  -- small; (2) There is enough space for a copy.
-    else case len0 <# slen# of
+  (\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of
-      1# -> case Exts.newByteArray# 4080# s0 of
+    1# -> case slen# >=# 256# of
-          (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
+      1# -> case Exts.newByteArray# 0# s0 of
-            s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #)
+        (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
-      _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
+      _ -> case Exts.newByteArray# 4080# s0 of
-        (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
+        (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
          s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #)
    _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
      (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
  )
 -- | Create a builder from a byte sequence. This always results in a
 -- call to @memcpy@. This is beneficial when the byte sequence is
 -- known to be small (less than 256 bytes).
 copy :: Bytes -> Builder
 copy (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
  (\buf0 off0 len0 cs0 s0 -> case len0 <# slen# of
    1# -> case Exts.newByteArray# newSz s0 of
        (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
          s2 -> (# s2, buf1, slen#, newSz -# slen#, Mutable buf0 off0 cs0 #)
    _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
      (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
  )
  where
  !(I# newSz) = max (I# slen#) 4080
 -- | Create a builder from a byte sequence. This never calls @memcpy@.
 -- Instead, it pushes a chunk that references the argument byte sequence.
 -- This wastes the remaining space in the active chunk, so it may adversely
 -- affect performance if used carelessly. See 'flush' for a way to mitigate
 -- this problem. This functions is most beneficial when the byte sequence
 -- is known to be large (more than 8192 bytes).
 insert :: Bytes -> Builder
 insert (Bytes (ByteArray src# ) (I# soff# ) (I# slen# )) = Builder
  (\buf0 off0 _ cs0 s0 -> case Exts.newByteArray# 0# s0 of
    (# s1, buf1 #) ->
      (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
  )
 -- | Create a builder from a slice of an array of 'Word8'. There is the same
@ -184,6 +222,20 @@ bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder
 word8Array :: PrimArray Word8 -> Int -> Int -> Builder
 word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len)
 word16ArrayLE :: PrimArray Word16 -> Int -> Int -> Builder
 word16ArrayLE src@(PrimArray arr) soff0 slen0 = case targetByteOrder of
  LittleEndian -> bytes (Bytes (ByteArray arr) (soff0 * 2) (slen0 * 2))
  BigEndian -> fromFunction (slen0 * 2) (go soff0 (soff0 + slen0))
  where
  go :: Int -> Int -> MutableByteArray s -> Int -> ST s Int
  go !soff !send !dst !doff = if soff < send
    then do
      doff' <- UnsafeBounded.pasteST
        (Bounded.word16LE (PM.indexPrimArray src soff))
        dst doff
      go (soff + 1) send dst doff'
    else pure doff
 -- Internal function. Precondition, the referenced slice of the
 -- byte sequence is UTF-8 encoded text.
 slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
@ -502,6 +554,25 @@ commitDistance target !n (Mutable buf len cs) =
    1# -> n +# len
    _ -> commitDistance target (n +# len) cs
 -- | Push the buffer currently being filled onto the chunk list,
 -- allocating a new active buffer of the requested size. This is
 -- helpful when a small builder is sandwhiched between two large
 -- zero-copy builders:
 --
 -- > insert bigA <> flush 1 <> word8 0x42 <> insert bigB
 --
 -- Without @flush 1@, @word8 0x42@ would see the zero-byte active
 -- buffer that 'insert' returned, decide that it needed more space,
 -- and allocate a 4080-byte buffer to which only a single byte
 -- would be written.
 flush :: Int -> Builder
 flush !reqSz = Builder $ \buf0 off0 _ cs0 s0 ->
  case Exts.newByteArray# sz# s0 of
    (# sX, bufX #) ->
      (# sX, bufX, 0#, sz#, Mutable buf0 off0 cs0 #)
  where
  !(I# sz# ) = max reqSz 0
 -- ShortText is already UTF-8 encoded. This is a no-op.
 shortTextToByteArray :: ShortText -> ByteArray
 shortTextToByteArray x = case TS.toShortByteString x of
--- a/test/Main.hs
+++ b/test/Main.hs
@ -23,7 +23,6 @@ import qualified Data.List as L
 import qualified Data.Primitive as PM
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as TE
 import qualified Data.Vector as V
 import qualified GHC.Exts as Exts
 import qualified Test.Tasty.HUnit as THU
 import qualified Test.Tasty.QuickCheck as TQC
@ -63,7 +62,7 @@ tests = testGroup "Tests"
        runConcat 1 (consLength32BE (word8Dec w))
        ===
        pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w)
-    , TQC.testProperty "consLength64BE" $ \w ->
+    , TQC.testProperty "consLength64BE-uni" $ \w ->
        pack
          ( '\x00' : '\x00' : '\x00' : '\x00'
          : '\x00' : '\x00' : '\x00' : chr (L.length (show w))
@ -71,6 +70,14 @@ tests = testGroup "Tests"
          )
        ===
        runConcat 1 (consLength64BE (word16Dec w))
    , TQC.testProperty "consLength64BE-multi" $ \w ->
        pack
          ( '\x00' : '\x00' : '\x00' : '\x00'
          : '\x00' : '\x00' : '\x00' : chr (1 + L.length (show w))
          : '\x42' : show w
          )
        ===
        runConcat 1 (consLength64BE (word8 0x42 <> flush 2 <> word16Dec w))
    , THU.testCase "stringUtf8" $
        packUtf8 "¿Cómo estás? I am doing well." @=?
          runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.")