From 831c25c81ae7ec1c738eb368fdf51de4e25c0813 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 25 Jun 2019 15:18:34 -0400 Subject: [PATCH] add more features --- small-bytearray-builder.cabal | 42 +++++- src/Data/ByteArray/Builder/Small.hs | 127 ++++++++++------ src/Data/ByteArray/Builder/Small/Unsafe.hs | 165 +++++++++++++++++++++ test/Main.hs | 36 +++++ 4 files changed, 322 insertions(+), 48 deletions(-) create mode 100644 src/Data/ByteArray/Builder/Small/Unsafe.hs diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 4da3e10..e3e22dd 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -1,10 +1,26 @@ -cabal-version: 2.2 +cabal-version: 2.2 name: small-bytearray-builder version: 0.1.0.0 --- synopsis: --- description: +synopsis: Serialize to a small byte arrays +description: + This is similar to the builder facilities provided by + `Data.ByteString.Builder`. It is intended to be used in + situations where the following apply: + . + * An individual entity will be serialized as a small + number of bytes (less than 512). + . + * A large number (more than 32) of entities will be serialized + one after another without anything between them. + . + Unlike builders from the `bytestring` package, these builders + do not track their state when they run out of space. A builder + that runs out of space simply aborts and is rerun at the beginning + of the next chunk. This strategy for building is suitable for most + CSVs and several line protocols (carbon, InfluxDB, etc.). + homepage: https://github.com/andrewthad/small-bytearray-builder --- bug-reports: +bug-reports: https://github.com/andrewthad/small-bytearray-builder/issues license: BSD-3-Clause license-file: LICENSE author: Andrew Martin @@ -13,16 +29,28 @@ copyright: 2019 Andrew Martin category: Data extra-source-files: CHANGELOG.md +flag checked + manual: True + description: Add bounds-checking to primitive array operations + default: False + library exposed-modules: Data.ByteArray.Builder.Small + Data.ByteArray.Builder.Small.Unsafe -- other-modules: -- other-extensions: build-depends: base >=4.12.0.0 && <5 - , primitive >=0.7 && <0.8 , byteslice >=0.1 && <0.2 - ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl + , primitive-offset >=0.2 && <0.3 + , run-st >=0.1 && <0.2 + , vector >=0.12.0.3 && <0.13 + if flag(checked) + build-depends: primitive-checked >= 0.7 && <0.8 + else + build-depends: primitive >= 0.7 && <0.8 + ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl -dsuppress-all hs-source-dirs: src default-language: Haskell2010 @@ -33,8 +61,10 @@ test-suite test main-is: Main.hs build-depends: base >=4.12.0.0 && <5 + , byteslice , small-bytearray-builder , QuickCheck >=2.13.1 && <2.14 , tasty-quickcheck >=0.10.1 && <0.11 , tasty >=1.2.3 && <1.3 , primitive + , vector diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder/Small.hs index c1f71b0..b68f4d0 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder/Small.hs @@ -1,9 +1,10 @@ -{-# language ScopedTypeVariables #-} {-# language BangPatterns #-} -{-# language MagicHash #-} -{-# language UnboxedTuples #-} -{-# language RankNTypes #-} +{-# language DuplicateRecordFields #-} {-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} module Data.ByteArray.Builder.Small ( -- * Unsafe Primitives @@ -13,21 +14,32 @@ module Data.ByteArray.Builder.Small , run , pasteST , pasteIO + , pasteGrowST + , pasteGrowIO + , pasteArrayST + , pasteArrayIO -- * Materialized Byte Sequences , bytes , bytearray -- * Numbers , word64Dec + , word64PaddedUpperHex ) where import Control.Monad.Primitive import Control.Monad.ST +import Control.Monad.ST.Run (runByteArrayST) import Data.Bytes.Types -import Data.Char (ord) import Data.Primitive import GHC.Exts import GHC.ST import GHC.Word +import GHC.TypeLits (KnownNat,natVal') +import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) + +import qualified Data.Primitive as PM +import qualified Data.Vector as V +import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe -- | An unmaterialized sequence of bytes that may be pasted -- into a mutable byte array. @@ -41,12 +53,15 @@ instance Semigroup Builder where 1# -> g arr r (len0 +# (off0 -# r)) s1 _ -> (# s1, (-1#) #) +instance Monoid Builder where + mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) + -- | Run a builder. An accurate size hint is important for good performance. run :: Int -- ^ Hint for upper bound on size -> Builder -- ^ Builder -> ByteArray -run hint b = runST $ do +run hint b = runByteArrayST $ do let go !n = do arr <- newByteArray n pasteST b (MutableBytes arr 0 n) >>= \case @@ -56,6 +71,58 @@ run hint b = runST $ do unsafeFreezeByteArray arr go hint +pasteArrayIO :: + MutableBytes RealWorld -- ^ Buffer + -> (a -> Builder) -- ^ Builder + -> V.Vector a -- ^ Elements to serialize + -> IO (V.Vector a, MutableBytes RealWorld) -- ^ Shifted vector, shifted buffer +pasteArrayIO !arr f !xs = stToIO (pasteArrayST arr f xs) + +pasteArrayST :: + MutableBytes s -- ^ Buffer + -> (a -> Builder) -- ^ Builder + -> V.Vector a -- ^ Elements to serialize + -> ST s (V.Vector a, MutableBytes s) -- ^ Shifted vector, shifted buffer +pasteArrayST (MutableBytes arr off0 len0) f !xs0 = do + let go !xs !ixBufA !lenBufA = if V.length xs > 0 + then do + let a = V.unsafeHead xs + pasteST (f a) (MutableBytes arr ixBufA lenBufA) >>= \case + Nothing -> pure (xs,MutableBytes arr ixBufA lenBufA) + Just ixBufB -> + go (V.unsafeTail xs) ixBufB (lenBufA + (ixBufA - ixBufB)) + else pure (xs,MutableBytes arr ixBufA lenBufA) + go xs0 off0 len0 + +-- | Paste the builder into the byte array starting at offset zero. +-- This repeatedly reallocates the byte array if it cannot accomodate +-- the builder, replaying the builder each time. +pasteGrowST :: + Int -- ^ How many bytes to grow by at a time + -> Builder + -> MutableByteArray s + -- ^ Initial buffer, used linearly. Do not reuse this argument. + -> ST s (MutableByteArrayOffset s) + -- ^ Final buffer that accomodated the builder. +pasteGrowST !n b !arr0 = do + let go !arr !sz = pasteST b (MutableBytes arr 0 sz) >>= \case + Nothing -> do + let szNext = sz + n + arrNext <- PM.newByteArray szNext + go arrNext szNext + Just ix -> pure (MutableByteArrayOffset{array=arr,offset=ix}) + go arr0 =<< PM.getSizeofMutableByteArray arr0 + +-- | Variant of 'pasteGrowST' that runs in 'IO'. +pasteGrowIO :: + Int -- ^ How many bytes to grow by at a time + -> Builder + -> MutableByteArray RealWorld + -- ^ Initial buffer, used linearly. Do not reuse this argument. + -> IO (MutableByteArrayOffset RealWorld) + -- ^ Final buffer that accomodated the builder. +pasteGrowIO !n b !arr = stToIO (pasteGrowST n b arr) + pasteST :: Builder -> MutableBytes s -> ST s (Maybe Int) {-# inline pasteST #-} pasteST (Builder f) (MutableBytes (MutableByteArray arr) (I# off) (I# len)) = @@ -76,6 +143,14 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) +fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder +{-# inline fromUnsafe #-} +fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 -> + case fromIntegral (natVal' (proxy# :: Proxy# n)) of + I# req -> case len >=# req of + 1# -> f arr off s0 + _ -> (# s0, (-1#) #) + bytearray :: ByteArray -> Builder bytearray a = bytes (Bytes a 0 (sizeofByteArray a)) @@ -87,43 +162,11 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len else pure Nothing word64Dec :: Word64 -> Builder -word64Dec (W64# w) = word64Dec# w +word64Dec w = fromUnsafe (Unsafe.word64Dec w) -word64Dec# :: Word# -> Builder -{-# noinline word64Dec# #-} -word64Dec# w# = construct $ \(MutableBytes arr off0 len) -> if len >= 19 - then if w /= 0 - then do - let go off x = if x > 0 - then do - let (y,z) = quotRem x 10 - writeByteArray arr off (fromIntegral (z + 0x30) :: Word8) - go (off + 1) y - else do - reverseBytes arr off0 (off - 1) - pure (Just off) - go off0 w - else do - writeByteArray arr off0 (c2w '0') - pure (Just (off0 + 1)) - else pure Nothing - where - w = W64# w# - -reverseBytes :: MutableByteArray s -> Int -> Int -> ST s () -{-# inline reverseBytes #-} -reverseBytes arr begin end = go begin end where - go ixA ixB = if ixA < ixB - then do - a :: Word8 <- readByteArray arr ixA - b :: Word8 <- readByteArray arr ixB - writeByteArray arr ixA b - writeByteArray arr ixB a - go (ixA + 1) (ixB - 1) - else pure () - -c2w :: Char -> Word8 -c2w = fromIntegral . ord +word64PaddedUpperHex :: Word64 -> Builder +word64PaddedUpperHex w = + fromUnsafe (Unsafe.word64PaddedUpperHex w) unST :: ST s a -> State# s -> (# State# s, a #) unST (ST f) = f diff --git a/src/Data/ByteArray/Builder/Small/Unsafe.hs b/src/Data/ByteArray/Builder/Small/Unsafe.hs new file mode 100644 index 0000000..36e071c --- /dev/null +++ b/src/Data/ByteArray/Builder/Small/Unsafe.hs @@ -0,0 +1,165 @@ +{-# language GADTSyntax #-} +{-# language KindSignatures #-} +{-# language ScopedTypeVariables #-} +{-# language BangPatterns #-} +{-# language MagicHash #-} +{-# language UnboxedTuples #-} +{-# language RankNTypes #-} +{-# language LambdaCase #-} +{-# language TypeOperators #-} +{-# language DataKinds #-} + +-- | The functions in this module do not check to +-- see if there is enough space in the buffer. +module Data.ByteArray.Builder.Small.Unsafe + ( Builder(..) + , run + , pasteST + , pasteIO + , construct + , append + , word64Dec + , word64PaddedUpperHex + ) where + +import Control.Monad.Primitive +import Control.Monad.ST +import Data.Bits +import Data.Bytes.Types +import Data.Char (ord) +import Data.Primitive +import GHC.Exts +import GHC.ST +import GHC.Word +import Data.Kind +import GHC.TypeLits (KnownNat,Nat,type (+),natVal') + +newtype Builder :: Nat -> Type where + Builder :: + (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) + -> Builder n + +run :: forall n. KnownNat n + => Builder n -- ^ Builder + -> ByteArray +{-# inline run #-} +run b = runST $ do + arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n))) + len <- pasteST b arr 0 + shrinkMutableByteArray arr len + unsafeFreezeByteArray arr + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int +{-# inline pasteST #-} +pasteST (Builder f) (MutableByteArray arr) (I# off) = + ST $ \s0 -> case f arr off s0 of + (# s1, r #) -> (# s1, (I# r) #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int +{-# inline pasteIO #-} +pasteIO b m off = stToIO (pasteST b m off) + +construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n +{-# inline construct #-} +construct f = Builder + $ \arr off s0 -> + case unST (f (MutableByteArray arr) (I# off)) s0 of + (# s1, (I# n) #) -> (# s1, n #) + +append :: Builder n -> Builder m -> Builder (n + m) +append (Builder f) (Builder g) = + Builder $ \arr off0 s0 -> case f arr off0 s0 of + (# s1, r #) -> g arr r s1 + +-- | Requires up to 19 bytes. +word64Dec :: Word64 -> Builder 19 +word64Dec (W64# w) = word64Dec# w + +-- | Requires up to 19 bytes. +word64Dec# :: Word# -> Builder 19 +{-# noinline word64Dec# #-} +word64Dec# w# = construct $ \arr off0 -> if w /= 0 + then do + let go off x = if x > 0 + then do + let (y,z) = quotRem x 10 + writeByteArray arr off (fromIntegral (z + 0x30) :: Word8) + go (off + 1) y + else do + reverseBytes arr off0 (off - 1) + pure off + go off0 w + else do + writeByteArray arr off0 (c2w '0') + pure (off0 + 1) + where + w = W64# w# + +-- Convert a number between 0 and 16 to the ASCII +-- representation of its hexadecimal character. +-- The use of fromIntegral causes us to incur an +-- unneeded bitmask. This actually needs a Word64 +-- argument. +toHexUpper :: Word -> Word8 +toHexUpper w' = fromIntegral + $ (complement theMask .&. loSolved) + .|. (theMask .&. hiSolved) + where + w = w' .&. 0xF + -- This is all ones if the value was >= 10 + theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1 + loSolved = w + 48 + hiSolved = w + 55 + +-- | Requires up to 16 bytes. +word64PaddedUpperHex :: Word64 -> Builder 16 +word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w + +word64PaddedUpperHex# :: Word# -> Builder 16 +{-# noinline word64PaddedUpperHex# #-} +word64PaddedUpperHex# w# = construct $ \arr off -> do + writeByteArray arr off (toHexUpper (unsafeShiftR w 60)) + writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56)) + writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52)) + writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 48)) + writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 44)) + writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 40)) + writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 36)) + writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 32)) + writeByteArray arr (off + 8) (toHexUpper (unsafeShiftR w 28)) + writeByteArray arr (off + 9) (toHexUpper (unsafeShiftR w 24)) + writeByteArray arr (off + 10) (toHexUpper (unsafeShiftR w 20)) + writeByteArray arr (off + 11) (toHexUpper (unsafeShiftR w 16)) + writeByteArray arr (off + 12) (toHexUpper (unsafeShiftR w 12)) + writeByteArray arr (off + 13) (toHexUpper (unsafeShiftR w 8)) + writeByteArray arr (off + 14) (toHexUpper (unsafeShiftR w 4)) + writeByteArray arr (off + 15) (toHexUpper (unsafeShiftR w 0)) + pure (off + 16) + where + w = W# w# + +reverseBytes :: MutableByteArray s -> Int -> Int -> ST s () +{-# inline reverseBytes #-} +reverseBytes arr begin end = go begin end where + go ixA ixB = if ixA < ixB + then do + a :: Word8 <- readByteArray arr ixA + b :: Word8 <- readByteArray arr ixB + writeByteArray arr ixA b + writeByteArray arr ixB a + go (ixA + 1) (ixB - 1) + else pure () + +c2w :: Char -> Word8 +c2w = fromIntegral . ord + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + +shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s () +shrinkMutableByteArray (MutableByteArray arr) (I# sz) = + primitive_ (shrinkMutableByteArray# arr sz) diff --git a/test/Main.hs b/test/Main.hs index 672a4a9..0b40096 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -1,11 +1,20 @@ +{-# language BangPatterns #-} +{-# language ScopedTypeVariables #-} {-# language TypeApplications #-} +import Control.Monad.ST (runST) +import Data.Bytes.Types (MutableBytes(..)) import Data.ByteArray.Builder.Small import Data.Word import Data.Char (ord) import Data.Primitive (ByteArray) +import Debug.Trace import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===)) +import Text.Printf (printf) +import qualified Data.Primitive as PM +import qualified Data.List as L +import qualified Data.Vector as V import qualified Test.Tasty.QuickCheck as TQC import qualified Test.QuickCheck as QC import qualified GHC.Exts as Exts @@ -21,8 +30,35 @@ tests = testGroup "Tests" run 1 (word64Dec x <> word64Dec y <> word64Dec z) === pack (show x ++ show y ++ show z) + , TQC.testProperty "word64PaddedUpperHex" $ \w -> + run 1 (word64PaddedUpperHex w) + === + pack (showWord64PaddedUpperHex w) + , TQC.testProperty "pasteArrayST" $ \(xs :: [Word64]) -> + (runArray word64Dec (V.fromList xs)) + === + pack (foldMap show xs) ] pack :: String -> ByteArray pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) +-- This is used to test pasteArrayST +runArray :: + (a -> Builder) -- ^ Builder + -> V.Vector a -- ^ Elements to serialize + -> ByteArray -- ^ Number of elements serialized, serialization +runArray f !xs = runST $ do + let go !v0 !sz !chunks = if V.null v0 + then pure (mconcat (L.reverse chunks)) + else do + arr <- PM.newByteArray sz + (v1,MutableBytes _ off _) <- pasteArrayST (MutableBytes arr 0 sz) f v0 + -- If nothing was serialized, we need a bigger buffer + let szNext = if V.length v0 == V.length v1 then sz + 1 else sz + c <- PM.unsafeFreezeByteArray =<< PM.resizeMutableByteArray arr off + go v1 szNext (c : chunks) + go xs 1 [] + +showWord64PaddedUpperHex :: Word64 -> String +showWord64PaddedUpperHex = printf "%016X"