From c7fcaff97f1af30b938be13a542cefdf75376a40 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Fri, 22 Nov 2019 16:24:48 -0500 Subject: [PATCH] Add putMany and considate consLength functions --- CHANGELOG.md | 6 ++ small-bytearray-builder.cabal | 4 +- src/Data/ByteArray/Builder.hs | 145 ++++++++++++++++++--------- src/Data/ByteArray/Builder/Unsafe.hs | 28 +++++- test/Main.hs | 37 +++++++ 5 files changed, 173 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cb0632..d286d22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Revision history for small-bytearray-builder +## 0.3.2.0 -- 2019-??-?? + +* Add `putMany`, which allows pasting into the same mutable byte + array over and over. +* Add `consLength`. + ## 0.3.1.0 -- 2019-11-20 * Add big-endian and little-endian parsers for `Word128`. This includes diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 7ee0f9b..fcb0c52 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: small-bytearray-builder -version: 0.3.1.0 +version: 0.3.2.0 synopsis: Serialize to a small byte arrays description: This is similar to the builder facilities provided by @@ -47,6 +47,7 @@ library , bytestring >=0.10.8.2 && <0.11 , natural-arithmetic >=0.1 && <0.2 , primitive-offset >=0.2 && <0.3 + , primitive-unlifted >=0.1.2 && <0.2 , run-st >=0.1 && <0.2 , text-short >=0.1.3 && <0.2 , wide-word >=0.1.0.9 && <0.2 @@ -74,6 +75,7 @@ test-suite test , bytestring , natural-arithmetic , primitive + , primitive-unlifted >=0.1.2 , quickcheck-classes >=0.6.4 , small-bytearray-builder , tasty >=1.2.3 && <1.3 diff --git a/src/Data/ByteArray/Builder.hs b/src/Data/ByteArray/Builder.hs index b951442..4120569 100644 --- a/src/Data/ByteArray/Builder.hs +++ b/src/Data/ByteArray/Builder.hs @@ -13,6 +13,7 @@ module Data.ByteArray.Builder , fromBounded -- * Evaluation , run + , putMany -- * Materialized Byte Sequences , bytes , copy @@ -88,6 +89,7 @@ module Data.ByteArray.Builder , int32ArrayLE , int16ArrayLE -- ** Prefixing with Length + , consLength , consLength32LE , consLength32BE , consLength64BE @@ -100,6 +102,7 @@ module Data.ByteArray.Builder import Control.Monad.ST (ST,runST) import Data.ByteArray.Builder.Unsafe (Builder(Builder)) +import Data.ByteArray.Builder.Unsafe (BuilderState(BuilderState),pasteIO) import Data.ByteArray.Builder.Unsafe (Commits(Initial,Mutable,Immutable)) import Data.ByteArray.Builder.Unsafe (reverseCommitsOntoChunks) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) @@ -107,14 +110,17 @@ import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.Bytes.Chunks (Chunks(ChunksNil)) import Data.Bytes.Types (Bytes(Bytes)) import Data.Char (ord) +import Data.Foldable (foldlM) import Data.Int (Int64,Int32,Int16,Int8) import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..)) +import Data.Primitive.Unlifted.Array (MutableUnliftedArray,UnliftedArray) import Data.Text.Short (ShortText) import Data.WideWord (Word128) import Data.Word (Word64,Word32,Word16,Word8) import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder) import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#)) -import GHC.Exts (MutableByteArray#,(+#),(-#),(<#)) +import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#)) +import GHC.IO (IO(IO)) import GHC.ST (ST(ST)) import qualified Arithmetic.Nat as Nat @@ -122,6 +128,7 @@ import qualified Arithmetic.Types as Arithmetic import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.Primitive as PM +import qualified Data.Primitive.Unlifted.Array as PM import qualified Data.Text.Short as TS import qualified GHC.Exts as Exts @@ -137,6 +144,73 @@ run hint@(I# hint# ) (Builder f) = runST $ do (# s1, Mutable bufX offX csX #) reverseCommitsOntoChunks ChunksNil cs +-- | Run a builder against lots of elements. This fills the same +-- underlying buffer over and over again. Do not let the argument to +-- the callback escape from the callback (i.e. do not write it to an +-- @IORef@). Also, do not @unsafeFreezeByteArray@ any of the mutable +-- byte arrays in the callback. The intent is that the callback will +-- write the buffers out, preferably using vectored I/O. +putMany :: Foldable f + => Int -- ^ Size of shared chunk (use 8176 if uncertain) + -> (a -> Builder) -- ^ Value builder + -> f a -- ^ Collection of values + -> (UnliftedArray (MutableByteArray RealWorld) -> IO b) -- ^ Consume chunks. + -> IO () +{-# inline putMany #-} +putMany hint@(I# hint#) g xs cb = do + MutableByteArray buf0 <- PM.newByteArray hint + BuilderState bufZ offZ _ cmtsZ <- foldlM + (\st0 a -> do + st1@(BuilderState buf off _ cmts) <- pasteIO (g a) st0 + case cmts of + Initial -> pure st1 + _ -> do + _ <- cb =<< commitsToArray buf off cmts + pure (BuilderState buf0 0# hint# Initial) + ) (BuilderState buf0 0# hint# Initial) xs + _ <- cb =<< commitsToArray bufZ offZ cmtsZ + pure () + +commitsToArray :: + MutableByteArray# RealWorld -- final chunk to append to commits + -> Int# -- offset + -> Commits RealWorld + -> IO (UnliftedArray (MutableByteArray RealWorld)) +commitsToArray buf off cmts = do + let ct = countCommits 1 cmts + bufs <- PM.unsafeNewUnliftedArray ct + -- Only shrink the last chunk. Crucially, this is never the first + -- chunk (except on the commitsToArray call at the end of folding + -- over the collection). We only perform this shrink in the hopes + -- that a future GHC will allow reclaiming bytes from shrunk arrays. + shrinkMutableByteArray (MutableByteArray buf) (I# off) + PM.writeUnliftedArray bufs (ct - 1) (MutableByteArray buf) + writeCommitsToArray (ct - 2) bufs cmts + PM.unsafeFreezeUnliftedArray bufs + +-- See the documentation for putMany. +writeCommitsToArray :: + Int + -> MutableUnliftedArray RealWorld (MutableByteArray RealWorld) + -> Commits RealWorld + -> IO () +writeCommitsToArray !ix !arrs x0 = case x0 of + Initial -> pure () + Mutable buf _ x1 -> do + PM.writeUnliftedArray arrs ix (MutableByteArray buf) + writeCommitsToArray (ix - 1) arrs x1 + Immutable arr off len x1 -> do + buf <- PM.newByteArray (I# len) + PM.copyByteArray buf 0 (ByteArray arr) (I# off) (I# len) + PM.writeUnliftedArray arrs ix buf + writeCommitsToArray (ix - 1) arrs x1 + +countCommits :: Int -> Commits s -> Int +countCommits !n x0 = case x0 of + Initial -> n + Mutable _ _ x1 -> countCommits (n + 1) x1 + Immutable _ _ _ x1 -> countCommits (n + 1) x1 + -- | Convert a bounded builder to an unbounded one. If the size -- is a constant, use @Arithmetic.Nat.constant@ as the first argument -- to let GHC conjure up this value for you. @@ -603,7 +677,7 @@ word8PaddedUpperHex w = ascii :: Char -> Builder ascii c = fromBoundedOne (Bounded.ascii c) --- | Encode an UTF8 char. This only uses as much space as is required. +-- | Encode a UTF-8 char. This only uses as much space as is required. char :: Char -> Builder char c = fromBounded Nat.constant (Bounded.char c) @@ -684,33 +758,43 @@ word16BE w = fromBounded Nat.constant (Bounded.word16BE w) word8 :: Word8 -> Builder word8 w = fromBoundedOne (Bounded.word8 w) --- | Variant of 'consLength32BE' the encodes the length in --- a little-endian fashion. -consLength32LE :: Builder -> Builder -consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> - let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of +-- | Prefix a builder with the number of bytes that it requires. +consLength :: + Arithmetic.Nat n -- ^ Number of bytes used by the serialization of the length + -> (Int -> Bounded.Builder n) -- ^ Length serialization function + -> Builder -- ^ Builder whose length is measured + -> Builder +{-# inline consLength #-} +consLength !n buildSize (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> + let !(I# lenSz) = Nat.demote n + !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# lenSz of 1# -> (# s0, buf0, off0, len0, cs0 #) _ -> case Exts.newByteArray# 4080# s0 of (# sX, bufX #) -> (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #) - in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of + in case f buf1 (off1 +# lenSz) (len1 -# lenSz) cs1 s1 of (# s2, buf2, off2, len2, cs2 #) -> let !dist = case Exts.sameMutableByteArray# buf1 buf2 of 1# -> off2 -# off1 _ -> commitDistance buf1 off2 cs2 -# off1 ST g = UnsafeBounded.pasteST - (Bounded.word32LE (fromIntegral (I# (dist -# 4# )))) + (buildSize (fromIntegral (I# (dist -# lenSz)))) (MutableByteArray buf1) (I# off1) in case g s2 of (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #) +-- | Variant of 'consLength32BE' the encodes the length in +-- a little-endian fashion. +consLength32LE :: Builder -> Builder +consLength32LE = consLength Nat.constant (\x -> Bounded.word32LE (fromIntegral x)) + -- | Prefix a builder with its size in bytes. This size is -- presented as a big-endian 32-bit word. The need to prefix -- a builder with its length shows up a numbers of wire protocols -- including those of PostgreSQL and Apache Kafka. Note the -- equivalence: --- +-- -- > forall (n :: Int) (x :: Builder). -- > let sz = sizeofByteArray (run n (consLength32BE x)) -- > consLength32BE x === word32BE (fromIntegral sz) <> x @@ -718,45 +802,12 @@ consLength32LE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> -- However, using 'consLength32BE' is much more efficient here -- since it only materializes the 'ByteArray' once. consLength32BE :: Builder -> Builder -consLength32BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> - let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 4# of - 1# -> (# s0, buf0, off0, len0, cs0 #) - _ -> case Exts.newByteArray# 4080# s0 of - (# sX, bufX #) -> - (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #) - in case f buf1 (off1 +# 4# ) (len1 -# 4# ) cs1 s1 of - (# s2, buf2, off2, len2, cs2 #) -> - let !dist = case Exts.sameMutableByteArray# buf1 buf2 of - 1# -> off2 -# off1 - _ -> commitDistance buf1 off2 cs2 -# off1 - ST g = UnsafeBounded.pasteST - (Bounded.word32BE (fromIntegral (I# (dist -# 4# )))) - (MutableByteArray buf1) - (I# off1) - in case g s2 of - (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #) - +consLength32BE = consLength Nat.constant (\x -> Bounded.word32BE (fromIntegral x)) -- | Prefix a builder with its size in bytes. This size is -- presented as a big-endian 64-bit word. See 'consLength32BE'. consLength64BE :: Builder -> Builder -consLength64BE (Builder f) = Builder $ \buf0 off0 len0 cs0 s0 -> - let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# 8# of - 1# -> (# s0, buf0, off0, len0, cs0 #) - _ -> case Exts.newByteArray# 4080# s0 of - (# sX, bufX #) -> - (# sX, bufX, 0#, 4080#, Mutable buf0 off0 cs0 #) - in case f buf1 (off1 +# 8# ) (len1 -# 8# ) cs1 s1 of - (# s2, buf2, off2, len2, cs2 #) -> - let !dist = case Exts.sameMutableByteArray# buf1 buf2 of - 1# -> off2 -# off1 - _ -> commitDistance buf1 off2 cs2 -# off1 - ST g = UnsafeBounded.pasteST - (Bounded.word64BE (fromIntegral (I# (dist -# 8# )))) - (MutableByteArray buf1) - (I# off1) - in case g s2 of - (# s3, _ #) -> (# s3, buf2, off2, len2, cs2 #) +consLength64BE = consLength Nat.constant (\x -> Bounded.word64BE (fromIntegral x)) commitDistance :: MutableByteArray# s -> Int# -> Commits s -> Int# commitDistance _ !_ Initial = error "chunkDistance: chunk not found" @@ -796,3 +847,7 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) c2w :: Char -> Word8 c2w = fromIntegral . ord + +shrinkMutableByteArray :: MutableByteArray RealWorld -> Int -> IO () +shrinkMutableByteArray (MutableByteArray x) (I# i) = + IO (\s -> (# Exts.shrinkMutableByteArray# x i s, ()#)) diff --git a/src/Data/ByteArray/Builder/Unsafe.hs b/src/Data/ByteArray/Builder/Unsafe.hs index c4e8640..2cc3716 100644 --- a/src/Data/ByteArray/Builder/Unsafe.hs +++ b/src/Data/ByteArray/Builder/Unsafe.hs @@ -9,7 +9,11 @@ module Data.ByteArray.Builder.Unsafe ( -- * Types Builder(..) + , BuilderState(..) , Commits(..) + -- * Execution + , pasteST + , pasteIO -- * Construction , fromEffect -- * Finalization @@ -30,8 +34,9 @@ import Foreign.C.String (CString) import GHC.Base (unpackCString#,unpackCStringUtf8#) import GHC.Exts ((-#),(+#),(>#),(>=#)) import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr)) -import GHC.Exts (IsString,Int#,State#) +import GHC.Exts (RealWorld,IsString,Int#,State#) import GHC.ST (ST(ST)) +import GHC.IO (stToIO) import qualified Data.ByteArray.Builder.Bounded as Bounded import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded @@ -50,6 +55,27 @@ newtype Builder (# State# s, MutableByteArray# s, Int#, Int#, Commits s #) -- all the same things ) +data BuilderState s = BuilderState + (MutableByteArray# s) -- buffer we are currently writing to + Int# -- offset into the current buffer + Int# -- number of bytes remaining in the current buffer + !(Commits s) -- buffers and immutable byte slices that are already committed + +-- | Run a builder, performing an in-place update on the state. +-- The @BuilderState@ argument must not be reused after being passed +-- to this function. That is, its use must be affine. +pasteST :: Builder -> BuilderState s -> ST s (BuilderState s) +{-# inline pasteST #-} +pasteST (Builder f) (BuilderState buf off len cmts) = ST $ \s0 -> + case f buf off len cmts s0 of + (# s1, buf1, off1, len1, cmts1 #) -> + (# s1, BuilderState buf1 off1 len1 cmts1 #) + +-- | Variant of 'pasteST' that runs in 'IO'. +pasteIO :: Builder -> BuilderState RealWorld -> IO (BuilderState RealWorld) +{-# inline pasteIO #-} +pasteIO b st = stToIO (pasteST b st) + instance IsString Builder where {-# inline fromString #-} fromString = stringUtf8 diff --git a/test/Main.hs b/test/Main.hs index 0c23bb8..2a6eb99 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -13,6 +13,7 @@ import Data.Bytes.Chunks (Chunks(ChunksNil,ChunksCons)) import Data.Primitive (PrimArray) import Data.Word import Data.Char (ord,chr) +import Data.IORef (IORef,newIORef,readIORef,writeIORef) import Data.Primitive (ByteArray) import Data.Proxy (Proxy(..)) import Data.WideWord (Word128(Word128)) @@ -28,6 +29,7 @@ import qualified Data.ByteString.Lazy.Char8 as LB import qualified Data.Bytes.Chunks as Chunks import qualified Data.List as L import qualified Data.Primitive as PM +import qualified Data.Primitive.Unlifted.Array as PM import qualified Data.Text as T import qualified Data.Text.Encoding as TE import qualified GHC.Exts as Exts @@ -197,8 +199,40 @@ tests = testGroup "Tests" , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks)) , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks)) ] + , testGroup "putMany" + [ THU.testCase "A" $ do + ref <- newIORef [] + let txt = "hello_world_are_you_listening" :: [Char] + putMany 7 ascii txt (ontoRef ref) + res <- readIORef ref + id $ + [ map c2w "hello_w" + , map c2w "o" + , map c2w "rld_are" + , map c2w "_" + , map c2w "you_lis" + , map c2w "t" + , map c2w "ening" + ] @=? map Exts.toList (Exts.toList res) + ] ] +ontoRef :: + IORef [PM.ByteArray] + -> PM.UnliftedArray (PM.MutableByteArray Exts.RealWorld) + -> IO () +ontoRef !ref xs = do + rs <- readIORef ref + ps <- PM.foldlUnliftedArrayM' + (\ys buf -> do + len <- PM.getSizeofMutableByteArray buf + dst <- PM.newByteArray len + PM.copyMutableByteArray dst 0 buf 0 len + dst' <- PM.unsafeFreezeByteArray dst + pure (ys ++ [dst']) + ) [] xs + writeIORef ref (rs ++ ps) + instance Arbitrary Chunks where arbitrary = do xs :: [[Word8]] <- TQC.arbitrary @@ -232,5 +266,8 @@ showWord64PaddedUpperHex = printf "%016X" runConcat :: Int -> Builder -> ByteArray runConcat n = Chunks.concat . run n +c2w :: Char -> Word8 +c2w = fromIntegral . ord + instance Arbitrary Word128 where arbitrary = liftA2 Word128 TQC.arbitrary TQC.arbitrary