From 8baf4cc3696b3b3b989093c636b7821d995f9119 Mon Sep 17 00:00:00 2001
From: Andrew Martin <andrew.thaddeus@gmail.com>
Date: Wed, 9 Oct 2019 16:30:02 -0400
Subject: [PATCH] Switch builder implementation to use chunks (#10)

* switch builder implementation to use chunks
* micro-optimize the conditional for builders of length one
* use fromBoundedOne for ascii chars
---
 common/Word16Tree.hs                 |   1 +
 small-bytearray-builder.cabal        |   3 +-
 src/Data/ByteArray/Builder.hs        | 320 +++++++++++++--------------
 src/Data/ByteArray/Builder/Unsafe.hs |  87 +++++---
 src/Data/Bytes/Chunks.hs             |  76 +++++++
 test/Main.hs                         | 109 +++++----
 6 files changed, 347 insertions(+), 249 deletions(-)
 create mode 100644 src/Data/Bytes/Chunks.hs

diff --git a/common/Word16Tree.hs b/common/Word16Tree.hs
index ed042e0..1b4e6ec 100644
--- a/common/Word16Tree.hs
+++ b/common/Word16Tree.hs
@@ -35,6 +35,7 @@ expectedSmall :: ByteArray
 expectedSmall = Bytes.toByteArray $ Bytes.fromAsciiString
   "((AB59,(1F33,2E71)),((((FA9A,247B),890C),(0F13,((55BF,7CF1),389B))),1205))"
 
+
 exampleSmall :: Word16Tree
 exampleSmall = Branch
   (Branch
diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal
index 05f70ea..5203e65 100644
--- a/small-bytearray-builder.cabal
+++ b/small-bytearray-builder.cabal
@@ -36,6 +36,7 @@ flag checked
 
 library
   exposed-modules:
+    Data.Bytes.Chunks
     Data.ByteArray.Builder
     Data.ByteArray.Builder.Unsafe
     Data.ByteArray.Builder.Bounded
@@ -90,7 +91,7 @@ benchmark bench
     , small-bytearray-builder
     , text-short
     , byteslice
-  ghc-options: -Wall -O2
+  ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl -dsuppress-all
   default-language: Haskell2010
   hs-source-dirs: bench, common
   main-is: Main.hs
diff --git a/src/Data/ByteArray/Builder.hs b/src/Data/ByteArray/Builder.hs
index 56566b2..fbea1da 100644
--- a/src/Data/ByteArray/Builder.hs
+++ b/src/Data/ByteArray/Builder.hs
@@ -1,4 +1,5 @@
 {-# language BangPatterns #-}
+{-# language DataKinds #-}
 {-# language DuplicateRecordFields #-}
 {-# language LambdaCase #-}
 {-# language MagicHash #-}
@@ -9,19 +10,12 @@
 module Data.ByteArray.Builder
   ( -- * Bounded Primitives
     Builder
-  , construct
   , fromBounded
     -- * Evaluation
   , run
-  , pasteST
-  , pasteIO
-  , pasteGrowST
-  , pasteGrowIO
-  , pasteArrayST
-  , pasteArrayIO
     -- * Materialized Byte Sequences
   , bytes
-  , bytearray
+  , byteArray
   , shortTextUtf8
   , shortTextJsonString
   , cstring
@@ -77,9 +71,10 @@ module Data.ByteArray.Builder
   ) where
 
 import Control.Monad.Primitive (primitive_)
-import Control.Monad.ST (ST,stToIO)
+import Control.Monad.ST (ST,stToIO,runST)
 import Control.Monad.ST.Run (runByteArrayST)
 import Data.ByteArray.Builder.Unsafe (Builder(Builder))
+import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
 import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
@@ -90,8 +85,9 @@ import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
 import Data.Text.Short (ShortText)
 import Data.Word (Word64,Word32,Word16,Word8)
 import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#))
-import GHC.Exts ((+#),(-#))
+import GHC.Exts (MutableByteArray#,(+#),(-#),(<#))
 import GHC.ST (ST(ST))
+import Data.Bytes.Chunks (Chunks(..))
 
 import qualified Arithmetic.Nat as Nat
 import qualified Arithmetic.Types as Arithmetic
@@ -102,105 +98,31 @@ import qualified Data.Vector as V
 import qualified Data.ByteArray.Builder.Bounded as Bounded
 import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
 
--- | Run a builder. An accurate size hint is important for
--- good performance. The size hint should be slightly greater
--- than or equal to the actual size.
+-- | Run a builder.
 run ::
-     Int -- ^ Hint for upper bound on size
+     Int -- ^ Size of initial chunk (use 4080 if uncertain)
   -> Builder -- ^ Builder
-  -> ByteArray
-run hint b = runByteArrayST $ do
-  let go !n = do
-        arr <- PM.newByteArray n
-        pasteST b (MutableBytes arr 0 n) >>= \case
-          Nothing -> go (n + n + 16)
-          Just len -> do
-            shrinkMutableByteArray arr len
-            PM.unsafeFreezeByteArray arr
-  go (max hint 16)
+  -> Chunks
+run hint@(I# hint# ) (Builder f) = runST $ do
+  MutableByteArray buf0 <- PM.newByteArray hint
+  cs <- ST $ \s0 -> case f buf0 0# hint# Initial s0 of
+    (# s1, bufX, offX, _, csX #) ->
+      (# s1, Mutable bufX offX csX #)
+  commitsOntoChunks ChunksNil cs
 
--- | Variant of 'pasteArrayST' that runs in 'IO'.
-pasteArrayIO ::
-     MutableBytes RealWorld -- ^ Buffer
-  -> (a -> Builder) -- ^ Builder
-  -> V.Vector a -- ^ Elements to serialize
-  -> IO (V.Vector a, MutableBytes RealWorld) -- ^ Shifted vector, shifted buffer
-pasteArrayIO !arr f !xs = stToIO (pasteArrayST arr f xs)
-
--- | Fold over a vector, applying the builder to each element until
--- the buffer cannot accomodate any more.
-pasteArrayST ::
-     MutableBytes s -- ^ Buffer
-  -> (a -> Builder) -- ^ Builder
-  -> V.Vector a -- ^ Elements to serialize
-  -> ST s (V.Vector a, MutableBytes s) -- ^ Shifted vector, shifted buffer
-pasteArrayST (MutableBytes arr off0 len0) f !xs0 = do
-  let go !xs !ixBufA !lenBufA = if V.length xs > 0
-        then do
-          let a = V.unsafeHead xs
-          pasteST (f a) (MutableBytes arr ixBufA lenBufA) >>= \case
-            Nothing -> pure (xs,MutableBytes arr ixBufA lenBufA)
-            Just ixBufB ->
-              go (V.unsafeTail xs) ixBufB (lenBufA + (ixBufA - ixBufB))
-        else pure (xs,MutableBytes arr ixBufA lenBufA)
-  go xs0 off0 len0
-
--- | Paste the builder into the byte array starting at offset zero.
--- This repeatedly reallocates the byte array if it cannot accomodate
--- the builder, replaying the builder each time.
-pasteGrowST ::
-     Int -- ^ How many bytes to grow by at a time
-  -> Builder
-  -> MutableByteArrayOffset s
-     -- ^ Initial buffer, used linearly. Do not reuse this argument.
-  -> ST s (MutableByteArrayOffset s)
-     -- ^ Final buffer that accomodated the builder.
-pasteGrowST !n b !(MutableByteArrayOffset arr0 off0) = do
-  let go !arr !sz = pasteST b (MutableBytes arr off0 (sz - off0)) >>= \case
-        Nothing -> do
-          let szNext = sz + n
-          arrNext <- PM.resizeMutableByteArray arr szNext
-          go arrNext szNext
-        Just ix -> pure (MutableByteArrayOffset{array=arr,offset=ix})
-  go arr0 =<< PM.getSizeofMutableByteArray arr0
-
--- | Variant of 'pasteGrowST' that runs in 'IO'.
-pasteGrowIO ::
-     Int -- ^ How many bytes to grow by at a time
-  -> Builder
-  -> MutableByteArrayOffset RealWorld
-     -- ^ Initial buffer, used linearly. Do not reuse this argument.
-  -> IO (MutableByteArrayOffset RealWorld)
-     -- ^ Final buffer that accomodated the builder.
-pasteGrowIO !n b !arr = stToIO (pasteGrowST n b arr)
-
--- | Execute the builder, pasting its contents into a buffer.
--- If the buffer is not large enough, this returns 'Nothing'.
--- Otherwise, it returns the index in the buffer that follows
--- the payload just written.
-pasteST :: Builder -> MutableBytes s -> ST s (Maybe Int)
-{-# inline pasteST #-}
-pasteST (Builder f) (MutableBytes (MutableByteArray arr) (I# off) (I# len)) =
-  ST $ \s0 -> case f arr off len s0 of
-    (# s1, r #) -> if Exts.isTrue# (r /=# (-1#))
-      then (# s1, Just (I# r) #)
-      else (# s1, Nothing #)
-
--- | Variant of 'pasteST' that runs in 'IO'.
-pasteIO :: Builder -> MutableBytes RealWorld -> IO (Maybe Int)
-{-# inline pasteIO #-}
-pasteIO b m = stToIO (pasteST b m)
-
--- | Constructor for 'Builder' that works on a function with lifted
--- arguments instead of unlifted ones. This is just as unsafe as the
--- actual constructor.
-construct :: (forall s. MutableBytes s -> ST s (Maybe Int)) -> Builder
-construct f = Builder
-  $ \arr off len s0 ->
-    case unST (f (MutableBytes (MutableByteArray arr) (I# off) (I# len))) s0 of
-      (# s1, m #) -> case m of
-        Nothing -> (# s1, (-1#) #)
-        Just (I# n) -> (# s1, n #)
+-- Internal. This freezes all the mutable byte arrays in-place,
+-- so be careful. It also reverses the chunks since everything
+-- is backwards.
+commitsOntoChunks :: Chunks -> Commits s -> ST s Chunks
+commitsOntoChunks !xs Initial = pure xs
+commitsOntoChunks !xs (Immutable arr off len cs) =
+  commitsOntoChunks (ChunksCons (Bytes (ByteArray arr) (I# off) (I# len)) xs) cs
+commitsOntoChunks !xs (Mutable buf len cs) = case len of
+  0# -> commitsOntoChunks xs cs
+  _ -> do
+    shrinkMutableByteArray (MutableByteArray buf) (I# len)
+    arr <- PM.unsafeFreezeByteArray (MutableByteArray buf)
+    commitsOntoChunks (ChunksCons (Bytes arr 0 (I# len)) xs) cs
 
 -- | Convert a bounded builder to an unbounded one. If the size
 -- is a constant, use @Arithmetic.Nat.constant@ as the first argument
@@ -210,23 +132,51 @@ fromBounded ::
   -> Bounded.Builder n
   -> Builder
 {-# inline fromBounded #-}
-fromBounded n (UnsafeBounded.Builder f) = Builder $ \arr off len s0 ->
-  let !(I# req) = Nat.demote n in
-  case len >=# req of
-    1# -> f arr off s0
-    _ -> (# s0, (-1#) #)
+fromBounded n (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(I# req) = Nat.demote n
+      !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
+        1# -> (# s0, buf0, off0, len0, cs0 #)
+        _ -> let !(I# lenX) = max 4080 (I# req) in
+          case Exts.newByteArray# lenX s0 of
+            (# sX, bufX #) ->
+              (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
+   in case f buf1 off1 s1 of
+        (# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
+
+-- This is a micro-optimization that uses an equality check instead
+-- of an inequality check when the required number of bytes is one.
+-- Use this instead of fromBounded (where possible) leads to marginally
+-- better results in benchmarks.
+fromBoundedOne ::
+     Bounded.Builder 1
+  -> Builder
+{-# inline fromBoundedOne #-}
+fromBoundedOne (UnsafeBounded.Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 of
+        0# -> case Exts.newByteArray# 4080# s0 of
+          (# sX, bufX #) ->
+            (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
+        _ -> (# s0, buf0, off0, len0, cs0 #)
+   in case f buf1 off1 s1 of
+        (# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
 
 -- | Create a builder from an unsliced byte sequence.
-bytearray :: ByteArray -> Builder
-bytearray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
+byteArray :: ByteArray -> Builder
+byteArray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
 
 -- | Create a builder from a sliced byte sequence.
 bytes :: Bytes -> Builder
-bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len >= slen
-  then do
-    PM.copyByteArray arr off src soff slen
-    pure (Just (off + slen))
-  else pure Nothing
+bytes (Bytes (ByteArray src# ) (I# soff# ) slen@(I# slen# )) = Builder
+  (\buf0 off0 len0 cs0 s0 -> if slen >= 1024
+    then case Exts.newByteArray# 0# s0 of
+      (# s1, buf1 #) -> (# s1, buf1, 0#, 0#, Immutable src# soff# slen# (Mutable buf0 off0 cs0) #)
+    else case len0 <# slen# of
+      1# -> case Exts.newByteArray# 4080# s0 of
+          (# s1, buf1 #) -> case Exts.copyByteArray# src# soff# buf1 0# slen# s1 of
+            s2 -> (# s2, buf1, slen#, 4080# -# slen#, Mutable buf0 off0 cs0 #)
+      _ -> let !s1 = Exts.copyByteArray# src# soff# buf0 off0 slen# s0 in
+        (# s1, buf0, off0 +# slen#, len0 -# slen#, cs0 #)
+  )
 
 -- | Create a builder from a slice of an array of 'Word8'. There is the same
 -- as 'bytes' but is provided as a convenience for users working with different
@@ -238,31 +188,44 @@ word8Array (PrimArray arr) off len = bytes (Bytes (ByteArray arr) off len)
 -- byte sequence is UTF-8 encoded text.
 slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
 {-# inline slicedUtf8TextJson #-}
-slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
-  let slen0 = I# slen0#
-   in if dlen0 > (2 * slen0) + 2
-        then do
-          PM.writeByteArray dst doff0 (c2w '"')
-          let go !soff !slen !doff = if slen > 0
-                then case indexChar8Array (ByteArray src#) soff of
-                  '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
-                  '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
-                  '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
-                  '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
-                  '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
-                  c -> if c >= '\x20'
-                    then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
-                    else do
-                      write2 dst doff '\\' 'u'
-                      doff' <- UnsafeBounded.pasteST
-                        (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
-                        dst (doff + 2)
-                      go (soff + 1) (slen - 1) doff'
-                else pure doff
-          doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
-          PM.writeByteArray dst doffRes (c2w '"')
-          pure (Just (doffRes + 1))
-        else pure Nothing
+slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do
+  PM.writeByteArray dst doff0 (c2w '"')
+  let go !soff !slen !doff = if slen > 0
+        then case indexChar8Array (ByteArray src#) soff of
+          '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
+          '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
+          '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
+          '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
+          '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
+          c -> if c >= '\x20'
+            then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
+            else do
+              write2 dst doff '\\' 'u'
+              doff' <- UnsafeBounded.pasteST
+                (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
+                dst (doff + 2)
+              go (soff + 1) (slen - 1) doff'
+        else pure doff
+  doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
+  PM.writeByteArray dst doffRes (c2w '"')
+  pure (doffRes + 1)
+  where
+  slen0 = I# slen0#
+  reqLen = (2 * slen0) + 2
+
+-- | Constructor for 'Builder' that works on a function with lifted
+-- arguments instead of unlifted ones. This is just as unsafe as the
+-- actual constructor.
+fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
+fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
+        1# -> (# s0, buf0, off0, len0, cs0 #)
+        _ -> let !(I# lenX) = max 4080 (I# req) in
+          case Exts.newByteArray# lenX s0 of
+            (# sX, bufX #) ->
+              (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
+   in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
+        (# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
 
 -- Internal. Write two characters in the ASCII plane to a byte array.
 write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
@@ -420,7 +383,7 @@ word8PaddedUpperHex w =
 -- | Encode an ASCII char.
 -- Precondition: Input must be an ASCII character. This is not checked.
 ascii :: Char -> Builder
-ascii c = fromBounded Nat.constant (Bounded.ascii c)
+ascii c = fromBoundedOne (Bounded.ascii c)
 
 -- | Encode an UTF8 char. This only uses as much space as is required.
 char :: Char -> Builder
@@ -475,7 +438,7 @@ word16BE w = fromBounded Nat.constant (Bounded.word16BE w)
 
 -- | Requires exactly 1 byte.
 word8 :: Word8 -> Builder
-word8 w = fromBounded Nat.constant (Bounded.word8 w)
+word8 w = fromBoundedOne (Bounded.word8 w)
 
 -- | Prefix a builder with its size in bytes. This size is
 -- presented as a big-endian 32-bit word. The need to prefix
@@ -490,35 +453,54 @@ word8 w = fromBounded Nat.constant (Bounded.word8 w)
 -- However, using 'consLength32BE' is much more efficient here
 -- since it only materializes the 'ByteArray' once.
 consLength32BE :: Builder -> Builder
-consLength32BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 4# of
-  1# -> case f arr (off +# 4# ) (len -# 4# ) s0 of
-    (# s1, r #) -> case r of
-      (-1#) -> (# s1, (-1#) #)
-      _ ->
-        let ST g = UnsafeBounded.pasteST
-              (Bounded.word32BE (fromIntegral ((I# r - I# off) - 4)))
-              (MutableByteArray arr)
-              (I# off)
-         in case g s1 of
-              (# s2, _ #) -> (# s2, r #)
-  _ -> (# s0, (-1#) #)
+consLength32BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of
+        1# -> (# s0, buf0, off0, len0, cs0 #)
+        _ -> case Exts.newByteArray# 4080# s0 of
+          (# sX, bufX #) ->
+            (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
+   in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of
+        (# s2, buf2, off2, len2, cs2 #) ->
+          let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
+                1# -> off2 -# off1
+                _ -> commitDistance buf1 off2 cs2 -# off1
+              ST g = UnsafeBounded.pasteST
+                (Bounded.word32BE (fromIntegral (I# (dist -# 4# ))))
+                (MutableByteArray buf1)
+                (I# off1)
+           in case g s2 of
+                (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
 
 
 -- | Prefix a builder with its size in bytes. This size is
 -- presented as a big-endian 64-bit word. See 'consLength32BE'.
 consLength64BE :: Builder -> Builder
-consLength64BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 8# of
-  1# -> case f arr (off +# 8# ) (len -# 8# ) s0 of
-    (# s1, r #) -> case r of
-      (-1#) -> (# s1, (-1#) #)
-      _ ->
-        let ST g = UnsafeBounded.pasteST
-              (Bounded.word64BE (fromIntegral ((I# r - I# off) - 8)))
-              (MutableByteArray arr)
-              (I# off)
-         in case g s1 of
-              (# s2, _ #) -> (# s2, r #)
-  _ -> (# s0, (-1#) #)
+consLength64BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 8# of
+        1# -> (# s0, buf0, off0, len0, cs0 #)
+        _ -> case Exts.newByteArray# 4080# s0 of
+          (# sX, bufX #) ->
+            (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #)
+   in case f buf1 (off1 +# 8# ) (len1 -# 8# ) cs1 s1 of
+        (# s2, buf2, off2, len2, cs2 #) ->
+          let !dist = case Exts.sameMutableByteArray# buf1 buf2 of
+                1# -> off2 -# off1
+                _ -> commitDistance buf1 off2 cs2 -# off1
+              ST g = UnsafeBounded.pasteST
+                (Bounded.word64BE (fromIntegral (I# (dist -# 8# ))))
+                (MutableByteArray buf1)
+                (I# off1)
+           in case g s2 of
+                (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #)
+
+commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int#
+commitDistance _ !_ Initial = error "chunkDistance: chunk not found"
+commitDistance target !n (Immutable _ _ len cs) =
+  commitDistance target (n +# len) cs
+commitDistance target !n (Mutable buf len cs) =
+  case Exts.sameMutableByteArray# target buf of
+    1# -> n +# len
+    _ -> commitDistance target (n +# len) cs
 
 -- ShortText is already UTF-8 encoded. This is a no-op.
 shortTextToByteArray :: ShortText -> ByteArray
diff --git a/src/Data/ByteArray/Builder/Unsafe.hs b/src/Data/ByteArray/Builder/Unsafe.hs
index 844cf83..6a193ee 100644
--- a/src/Data/ByteArray/Builder/Unsafe.hs
+++ b/src/Data/ByteArray/Builder/Unsafe.hs
@@ -9,6 +9,7 @@
 module Data.ByteArray.Builder.Unsafe
   ( -- * Types
     Builder(..)
+  , Commits(..)
     -- * Safe Functions
     -- | These functions are actually completely safe, but they are defined
     -- here because they are used by typeclass instances. Import them from
@@ -17,10 +18,10 @@ module Data.ByteArray.Builder.Unsafe
   , cstring
   ) where
 
-import Data.Primitive (MutableByteArray(MutableByteArray))
+import Data.Primitive (MutableByteArray(MutableByteArray),ByteArray)
 import Foreign.C.String (CString)
-import GHC.Exts ((-#),(+#),(/=#),(>#))
-import GHC.Exts (Addr#,Int(I#),Ptr(Ptr))
+import GHC.Exts ((-#),(+#),(>#))
+import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
 import GHC.Exts (IsString,Int#,State#,MutableByteArray#)
 import GHC.ST (ST(ST))
 import GHC.Base (unpackCString#,unpackCStringUtf8#)
@@ -32,10 +33,14 @@ import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
 -- | An unmaterialized sequence of bytes that may be pasted
 -- into a mutable byte array.
 newtype Builder
-  = Builder (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #))
-    -- ^ This function takes a buffer, an offset, and a number of remaining bytes.
-    --   It returns the new offset (should be greater than the old offset), or if
-    --   there was not enough space left in buffer, it returns -1.
+  = Builder (forall s.
+      MutableByteArray# s ->   -- buffer we are currently writing to
+      Int# ->   -- offset into the current buffer
+      Int# ->   -- number of bytes remaining in the current buffer
+      Commits s ->   -- buffers and immutable byte slices that we have already committed
+      State# s ->
+      (# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things
+    )
 
 instance IsString Builder where
   {-# inline fromString #-}
@@ -43,53 +48,73 @@ instance IsString Builder where
 
 instance Semigroup Builder where
   {-# inline (<>) #-}
-  Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of
-    (# s1, r #) -> case r /=# (-1#) of
-      1# -> g arr r (len0 +# (off0 -# r)) s1
-      _ -> (# s1, (-1#) #)
+  Builder f <> Builder g = Builder $ \buf0 off0 len0 cs0 s0 -> case f buf0 off0 len0 cs0 s0 of
+    (# s1, buf1, off1, len1, cs1 #) -> g buf1 off1 len1 cs1 s1
 
 instance Monoid Builder where
   {-# inline mempty #-}
-  mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #)
+  mempty = Builder $ \buf0 off0 len0 cs0 s0 -> (# s0, buf0, off0, len0, cs0 #)
+
+data Commits s
+  = Mutable
+      (MutableByteArray# s)
+      -- ^ Mutable buffer, start index implicitly zero
+      Int# -- ^ Length (may be smaller than actual length)
+      !(Commits s)
+  | Immutable
+      ByteArray# -- ^ Immutable chunk
+      Int# -- ^ Offset into chunk, not necessarily zero
+      Int# -- ^ Length (may be smaller than actual length)
+      !(Commits s)
+  | Initial
 
 -- | Create a builder from a cons-list of 'Char'. These
 -- are be UTF-8 encoded.
 stringUtf8 :: String -> Builder
 {-# inline stringUtf8 #-}
-stringUtf8 cs = Builder (\arr off0 len0 s0 -> goString cs arr off0 len0 s0)
+stringUtf8 cs = Builder (goString cs)
 
 -- | Create a builder from a @NUL@-terminated 'CString'. This ignores any
 -- textual encoding, copying bytes until @NUL@ is reached.
 cstring :: CString -> Builder
 {-# inline cstring #-}
-cstring (Ptr cs) = Builder (\arr off0 len0 s0 -> goCString cs arr off0 len0 s0)
+cstring (Ptr cs) = Builder (goCString cs)
 
-goString :: String -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+goString :: String
+  -> MutableByteArray# s -> Int# -> Int# -> Commits s
+  -> State# s -> (# State# s, MutableByteArray# s, Int#, Int#, Commits s #)
 {-# noinline goString #-}
-goString [] _ off0 _ s0 = (# s0, off0 #)
-goString (c : cs) buf off0 len0 s0 = case len0 ># 3# of
-  1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf) (I# off0)) s0 of
-    (# s1, I# off1 #) -> goString cs buf off1 (len0 -# (off1 -# off0)) s1
-  _ -> (# s0, (-1#) #)
+goString [] buf0 off0 len0 cs0 s0 = (# s0, buf0, off0, len0, cs0 #)
+goString (c : cs) buf0 off0 len0 cs0 s0 = case len0 ># 3# of
+  1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf0) (I# off0)) s0 of
+    (# s1, I# off1 #) -> goString cs buf0 off1 (len0 -# (off1 -# off0)) cs0 s1
+  _ -> case Exts.newByteArray# 4080# s0 of
+    (# s1, buf1 #) -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf1) 0) s1 of
+      (# s2, I# off1 #) -> goString cs buf1 off1 (4080# -# off1) (Mutable buf0 off0 cs0) s2
 
 -- We have to have a rule for both unpackCString# and unpackCStringUtf8#
 -- since GHC uses a different function based on whether or not non-ASCII
 -- codepoints are used in the string.
 {-# RULES
-"Builder stringUtf8/cstring" forall s a b c d.
-  goString (unpackCString# s) a b c d = goCString s a b c d
-"Builder stringUtf8/cstring-utf8" forall s a b c d.
-  goString (unpackCStringUtf8# s) a b c d = goCString s a b c d
+"Builder stringUtf8/cstring" forall s a b c d e.
+  goString (unpackCString# s) a b c d e = goCString s a b c d e
+"Builder stringUtf8/cstring-utf8" forall s a b c d e.
+  goString (unpackCStringUtf8# s) a b c d e = goCString s a b c d e
 #-}
 
-goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-goCString addr buf off0 len0 s0 = case Exts.indexWord8OffAddr# addr 0# of
-  0## -> (# s0, off0 #)
+goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> Commits s
+  -> State# s -> (# State# s, MutableByteArray# s, Int#, Int#, Commits s #)
+goCString addr buf0 off0 len0 cs0 s0 = case Exts.indexWord8OffAddr# addr 0# of
+  0## -> (# s0, buf0, off0, len0, cs0 #)
   w -> case len0 of
-    0# -> (# s0, (-1#) #)
-    _ -> case Exts.writeWord8Array# buf off0 w s0 of
-      s1 -> goCString (Exts.plusAddr# addr 1# ) buf (off0 +# 1# ) (len0 -# 1# ) s1
+    0# -> case Exts.newByteArray# 4080# s0 of
+      (# s1, buf1 #) -> case Exts.writeWord8Array# buf1 0# w s1 of
+        s2 -> goCString
+          (Exts.plusAddr# addr 1# ) buf1 1# (4080# -# 1# )
+          (Mutable buf0 off0 cs0)
+          s2
+    _ -> case Exts.writeWord8Array# buf0 off0 w s0 of
+      s1 -> goCString (Exts.plusAddr# addr 1# ) buf0 (off0 +# 1# ) (len0 -# 1# ) cs0 s1
 
 unST :: ST s a -> State# s -> (# State# s, a #)
 unST (ST f) = f
-
diff --git a/src/Data/Bytes/Chunks.hs b/src/Data/Bytes/Chunks.hs
new file mode 100644
index 0000000..983b541
--- /dev/null
+++ b/src/Data/Bytes/Chunks.hs
@@ -0,0 +1,76 @@
+{-# language BangPatterns #-}
+{-# language DerivingStrategies #-}
+{-# language TypeFamilies #-}
+{-# language MagicHash #-}
+{-# language UnboxedTuples #-}
+{-# language NamedFieldPuns #-}
+
+module Data.Bytes.Chunks
+  ( Chunks(..)
+  , concat
+  ) where
+
+import Prelude hiding (length,concat)
+
+import GHC.ST (ST(..))
+import Data.Bytes.Types (Bytes(..))
+import Data.Primitive (ByteArray(..),MutableByteArray(..))
+import GHC.Exts (ByteArray#,MutableByteArray#)
+import GHC.Exts (IsList,Int#,State#,Int(I#),(+#),(-#))
+import Control.Monad.ST.Run (runByteArrayST)
+
+import qualified GHC.Exts as Exts
+import qualified Data.Primitive as PM
+
+data Chunks
+  = ChunksCons {-# UNPACK #-} !Bytes !Chunks
+  | ChunksNil
+
+concat :: Chunks -> ByteArray
+concat x = ByteArray (concat# x)
+
+concat# :: Chunks -> ByteArray#
+{-# noinline concat# #-}
+concat# ChunksNil = case mempty of {ByteArray x -> x}
+concat# (ChunksCons (Bytes{array=c,offset=coff,length=szc}) cs) = case cs of
+  ChunksNil -> case c of {ByteArray x -> x}
+  ChunksCons (Bytes{array=d,offset=doff,length=szd}) ds ->
+    unBa $ runByteArrayST $ do
+      let szboth = szc + szd
+          len = chunksLengthGo szboth ds
+      dst <- PM.newByteArray len
+      PM.copyByteArray dst 0 c coff szc
+      PM.copyByteArray dst szc d doff szd
+      _ <- copy dst szboth ds
+      PM.unsafeFreezeByteArray dst
+
+chunksLengthGo :: Int -> Chunks -> Int
+chunksLengthGo !n ChunksNil = n
+chunksLengthGo !n (ChunksCons (Bytes{length}) cs) =
+  chunksLengthGo (n + length) cs
+
+-- | Copy the contents of the chunks into a mutable array.
+-- Precondition: The destination must have enough space to
+-- house the contents. This is not checked.
+copy ::
+     MutableByteArray s -- ^ Destination
+  -> Int -- ^ Destination offset
+  -> Chunks -- ^ Source
+  -> ST s Int -- ^ Returns the next index into the destination after the payload
+{-# inline copy #-}
+copy (MutableByteArray dst) (I# off) cs = ST
+  (\s0 -> case copy# dst off cs s0 of
+    (# s1, nextOff #) -> (# s1, I# nextOff #)
+  )
+
+copy# :: MutableByteArray# s -> Int# -> Chunks -> State# s -> (# State# s, Int# #)
+copy# _ off ChunksNil s0 = (# s0, off #)
+copy# marr off (ChunksCons (Bytes{array,offset,length}) cs) s0 =
+  case Exts.copyByteArray# (unBa array) (unI offset) marr off (unI length) s0 of
+    s1 -> copy# marr (off +# unI length) cs s1
+
+unI :: Int -> Int#
+unI (I# i) = i
+
+unBa :: ByteArray -> ByteArray#
+unBa (ByteArray x) = x
diff --git a/test/Main.hs b/test/Main.hs
index ec602b9..9e8001d 100644
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -18,6 +18,7 @@ import qualified Arithmetic.Nat as Nat
 import qualified Data.ByteString as ByteString
 import qualified Data.ByteString.Builder as BB
 import qualified Data.ByteString.Lazy.Char8 as LB
+import qualified Data.Bytes.Chunks as Chunks
 import qualified Data.List as L
 import qualified Data.Primitive as PM
 import qualified Data.Text as T
@@ -37,79 +38,99 @@ tests :: TestTree
 tests = testGroup "Tests"
   [ testGroup "live"
     [ TQC.testProperty "word64Dec" $ \w ->
-        run 1 (word64Dec w) === pack (show w)
+        runConcat 1 (word64Dec w) === pack (show w)
     , TQC.testProperty "word64Dec-x3" $ \x y z ->
-        run 1 (word64Dec x <> word64Dec y <> word64Dec z)
+        runConcat 1 (word64Dec x <> word64Dec y <> word64Dec z)
         ===
         pack (show x ++ show y ++ show z)
     , TQC.testProperty "int64Dec-x3" $ \x y z ->
-        run 1 (int64Dec x <> int64Dec y <> int64Dec z)
+        runConcat 1 (int64Dec x <> int64Dec y <> int64Dec z)
         ===
         pack (show x ++ show y ++ show z)
     , TQC.testProperty "word64BE-x3" $ \x y z ->
-        run 1 (word64BE x <> word64BE y <> word64BE z)
+        runConcat 1 (word64BE x <> word64BE y <> word64BE z)
         ===
         pack (LB.unpack (BB.toLazyByteString (BB.word64BE x <> BB.word64BE y <> BB.word64BE z)))
     , TQC.testProperty "word64PaddedUpperHex" $ \w ->
-        run 1 (word64PaddedUpperHex w)
+        runConcat 1 (word64PaddedUpperHex w)
         ===
         pack (showWord64PaddedUpperHex w)
     , TQC.testProperty "word8Dec" $ \w ->
-        run 1 (word8Dec w)
+        runConcat 1 (word8Dec w)
         ===
         pack (show w)
     , TQC.testProperty "consLength32BE" $ \w ->
-        run 1 (consLength32BE (word8Dec w))
+        runConcat 1 (consLength32BE (word8Dec w))
         ===
         pack ('\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w)
     , TQC.testProperty "consLength64BE" $ \w ->
-        run 1 (consLength64BE (word16Dec w))
+        pack
+          ( '\x00' : '\x00' : '\x00' : '\x00'
+          : '\x00' : '\x00' : '\x00' : chr (L.length (show w))
+          : show w
+          )
         ===
-        pack ('\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : '\x00' : chr (L.length (show w)) : show w)
-    , TQC.testProperty "pasteArrayST" $ \(xs :: [Word64]) ->
-        (runArray word64Dec (V.fromList xs))
-        ===
-        pack (foldMap show xs)
+        runConcat 1 (consLength64BE (word16Dec w))
     , THU.testCase "stringUtf8" $
         packUtf8 "¿Cómo estás? I am doing well." @=?
-          run 1 (stringUtf8 "¿Cómo estás? I am doing well.")
+          runConcat 1 (stringUtf8 "¿Cómo estás? I am doing well.")
     , THU.testCase "doubleDec-A" $
-        pack (show (2 :: Int)) @=? run 1 (doubleDec 2.0)
+        pack (show (2 :: Int)) @=? runConcat 1 (doubleDec 2.0)
     , THU.testCase "doubleDec-B" $
-        pack (show (2.5 :: Double)) @=? run 1 (doubleDec 2.5)
+        pack (show (2.5 :: Double)) @=? runConcat 1 (doubleDec 2.5)
     , THU.testCase "doubleDec-C" $
-        pack ("1e+15") @=? run 1 (doubleDec 1e15)
+        pack ("1e+15") @=? runConcat 1 (doubleDec 1e15)
     , THU.testCase "doubleDec-D" $
-        pack ("-42") @=? run 1 (doubleDec (-42))
+        pack ("-42") @=? runConcat 1 (doubleDec (-42))
     , THU.testCase "doubleDec-E" $
-        pack ("-8.88888888888888e+14") @=? run 1 (doubleDec (-888888888888888.8888888))
+        pack ("-8.88888888888888e+14") @=? runConcat 1 (doubleDec (-888888888888888.8888888))
     , THU.testCase "doubleDec-F" $
-        pack ("42") @=? run 1 (doubleDec 42)
+        pack ("42") @=? runConcat 1 (doubleDec 42)
     , THU.testCase "doubleDec-G" $
-        pack ("0") @=? run 1 (doubleDec 0)
+        pack ("0") @=? runConcat 1 (doubleDec 0)
     , THU.testCase "doubleDec-H" $
-        pack ("0.5") @=? run 1 (doubleDec 0.5)
+        pack ("0.5") @=? runConcat 1 (doubleDec 0.5)
     , THU.testCase "doubleDec-I" $
-        pack ("-0.5") @=? run 1 (doubleDec (-0.5))
+        pack ("-0.5") @=? runConcat 1 (doubleDec (-0.5))
     , THU.testCase "doubleDec-J" $
-        pack ("999999999") @=? run 1 (doubleDec 999999999)
+        pack ("999999999") @=? runConcat 1 (doubleDec 999999999)
     , THU.testCase "doubleDec-K" $
-        pack ("-99999999") @=? run 1 (doubleDec (-99999999))
+        pack ("-99999999") @=? runConcat 1 (doubleDec (-99999999))
     , THU.testCase "shortTextJsonString-A" $
-        pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
+        pack ("\"hello\"") @=? runConcat 1 (shortTextJsonString "hello")
     , THU.testCase "shortTextJsonString-B" $
-        pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
+        pack ("\"\\\\_\\\"_/\"") @=? runConcat 1 (shortTextJsonString "\\_\"_/")
     , THU.testCase "shortTextJsonString-C" $
-        pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
+        pack ("\"Hi\\r\\nLo\"") @=? runConcat 1 (shortTextJsonString "Hi\r\nLo")
     , THU.testCase "shortTextJsonString-D" $
-        pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
+        pack ("\"Hi\\u001BLo\"") @=? runConcat 1 (shortTextJsonString "Hi\ESCLo")
     , THU.testCase "word-16-tree" $
-        Word16Tree.expectedSmall @=? run 1
+        Word16Tree.expectedSmall @=? runConcat 1
           (Word16Tree.encode Word16Tree.exampleSmall)
+    , THU.testCase "byteArray-small" $
+        let a = replicateByte 3 0x50
+            b = replicateByte 5 0x51
+         in mconcat [a,b] @=? runConcat 1
+              ( byteArray a <> byteArray b )
+    , THU.testCase "byteArray-big" $
+        let a = replicateByte 2105 0x50
+            b = replicateByte 725 0x51
+            c = replicateByte 900 0x52
+            d = replicateByte 800 0x53
+            e = replicateByte 700 0x54
+            f = replicateByte 950 0x55
+            g = replicateByte 975 0x56
+            h = replicateByte 3000 0x57
+            i = replicateByte 125 0x58
+         in mconcat [a,b,c,d,e,f,g,h,i] @=? runConcat 1
+              ( byteArray a <> byteArray b <> byteArray c <>
+                byteArray d <> byteArray e <> byteArray f <>
+                byteArray g <> byteArray h <> byteArray i
+              )
     ]
   , testGroup "alternate"
     [ TQC.testProperty "HexWord64" $ \x y ->
-        run 1
+        runConcat 1
           (  fromBounded Nat.constant (HexWord64.word64PaddedUpperHex x)
           <> fromBounded Nat.constant (HexWord64.word64PaddedUpperHex y)
           )
@@ -118,28 +139,20 @@ tests = testGroup "Tests"
     ]
   ]
 
+replicateByte :: Int -> Word8 -> ByteArray
+replicateByte n w = runST $ do
+  m <- PM.newByteArray n
+  PM.setByteArray m 0 n w
+  PM.unsafeFreezeByteArray m
+
 pack :: String -> ByteArray
 pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord)
 
 packUtf8 :: String -> ByteArray
 packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack
 
--- This is used to test pasteArrayST
-runArray ::
-     (a -> Builder) -- ^ Builder
-  -> V.Vector a -- ^ Elements to serialize
-  -> ByteArray -- ^ Number of elements serialized, serialization
-runArray f !xs = runST $ do
-  let go !v0 !sz !chunks = if V.null v0
-        then pure (mconcat (L.reverse chunks))
-        else do
-          arr <- PM.newByteArray sz
-          (v1,MutableBytes _ off _) <- pasteArrayST (MutableBytes arr 0 sz) f v0
-          -- If nothing was serialized, we need a bigger buffer
-          let szNext = if V.length v0 == V.length v1 then sz + 1 else sz
-          c <- PM.unsafeFreezeByteArray =<< PM.resizeMutableByteArray arr off
-          go v1 szNext (c : chunks)
-  go xs 1 []
-
 showWord64PaddedUpperHex :: Word64 -> String
 showWord64PaddedUpperHex = printf "%016X" 
+
+runConcat :: Int -> Builder -> ByteArray
+runConcat n = Chunks.concat . run n