add more features

This commit is contained in:
Andrew Martin 2019-06-25 15:18:34 -04:00
parent 0b1a585add
commit 831c25c81a
4 changed files with 322 additions and 48 deletions

View file

@ -1,10 +1,26 @@
cabal-version: 2.2
name: small-bytearray-builder
version: 0.1.0.0
-- synopsis:
-- description:
synopsis: Serialize to a small byte arrays
description:
This is similar to the builder facilities provided by
`Data.ByteString.Builder`. It is intended to be used in
situations where the following apply:
.
* An individual entity will be serialized as a small
number of bytes (less than 512).
.
* A large number (more than 32) of entities will be serialized
one after another without anything between them.
.
Unlike builders from the `bytestring` package, these builders
do not track their state when they run out of space. A builder
that runs out of space simply aborts and is rerun at the beginning
of the next chunk. This strategy for building is suitable for most
CSVs and several line protocols (carbon, InfluxDB, etc.).
homepage: https://github.com/andrewthad/small-bytearray-builder
-- bug-reports:
bug-reports: https://github.com/andrewthad/small-bytearray-builder/issues
license: BSD-3-Clause
license-file: LICENSE
author: Andrew Martin
@ -13,16 +29,28 @@ copyright: 2019 Andrew Martin
category: Data
extra-source-files: CHANGELOG.md
flag checked
manual: True
description: Add bounds-checking to primitive array operations
default: False
library
exposed-modules:
Data.ByteArray.Builder.Small
Data.ByteArray.Builder.Small.Unsafe
-- other-modules:
-- other-extensions:
build-depends:
base >=4.12.0.0 && <5
, primitive >=0.7 && <0.8
, byteslice >=0.1 && <0.2
ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl
, primitive-offset >=0.2 && <0.3
, run-st >=0.1 && <0.2
, vector >=0.12.0.3 && <0.13
if flag(checked)
build-depends: primitive-checked >= 0.7 && <0.8
else
build-depends: primitive >= 0.7 && <0.8
ghc-options: -Wall -O2 -ddump-to-file -ddump-simpl -dsuppress-all
hs-source-dirs: src
default-language: Haskell2010
@ -33,8 +61,10 @@ test-suite test
main-is: Main.hs
build-depends:
base >=4.12.0.0 && <5
, byteslice
, small-bytearray-builder
, QuickCheck >=2.13.1 && <2.14
, tasty-quickcheck >=0.10.1 && <0.11
, tasty >=1.2.3 && <1.3
, primitive
, vector

View file

@ -1,9 +1,10 @@
{-# language ScopedTypeVariables #-}
{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language UnboxedTuples #-}
{-# language RankNTypes #-}
{-# language DuplicateRecordFields #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language UnboxedTuples #-}
module Data.ByteArray.Builder.Small
( -- * Unsafe Primitives
@ -13,21 +14,32 @@ module Data.ByteArray.Builder.Small
, run
, pasteST
, pasteIO
, pasteGrowST
, pasteGrowIO
, pasteArrayST
, pasteArrayIO
-- * Materialized Byte Sequences
, bytes
, bytearray
-- * Numbers
, word64Dec
, word64PaddedUpperHex
) where
import Control.Monad.Primitive
import Control.Monad.ST
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bytes.Types
import Data.Char (ord)
import Data.Primitive
import GHC.Exts
import GHC.ST
import GHC.Word
import GHC.TypeLits (KnownNat,natVal')
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import qualified Data.Primitive as PM
import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
-- | An unmaterialized sequence of bytes that may be pasted
-- into a mutable byte array.
@ -41,12 +53,15 @@ instance Semigroup Builder where
1# -> g arr r (len0 +# (off0 -# r)) s1
_ -> (# s1, (-1#) #)
instance Monoid Builder where
mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #)
-- | Run a builder. An accurate size hint is important for good performance.
run ::
Int -- ^ Hint for upper bound on size
-> Builder -- ^ Builder
-> ByteArray
run hint b = runST $ do
run hint b = runByteArrayST $ do
let go !n = do
arr <- newByteArray n
pasteST b (MutableBytes arr 0 n) >>= \case
@ -56,6 +71,58 @@ run hint b = runST $ do
unsafeFreezeByteArray arr
go hint
pasteArrayIO ::
MutableBytes RealWorld -- ^ Buffer
-> (a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> IO (V.Vector a, MutableBytes RealWorld) -- ^ Shifted vector, shifted buffer
pasteArrayIO !arr f !xs = stToIO (pasteArrayST arr f xs)
pasteArrayST ::
MutableBytes s -- ^ Buffer
-> (a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> ST s (V.Vector a, MutableBytes s) -- ^ Shifted vector, shifted buffer
pasteArrayST (MutableBytes arr off0 len0) f !xs0 = do
let go !xs !ixBufA !lenBufA = if V.length xs > 0
then do
let a = V.unsafeHead xs
pasteST (f a) (MutableBytes arr ixBufA lenBufA) >>= \case
Nothing -> pure (xs,MutableBytes arr ixBufA lenBufA)
Just ixBufB ->
go (V.unsafeTail xs) ixBufB (lenBufA + (ixBufA - ixBufB))
else pure (xs,MutableBytes arr ixBufA lenBufA)
go xs0 off0 len0
-- | Paste the builder into the byte array starting at offset zero.
-- This repeatedly reallocates the byte array if it cannot accomodate
-- the builder, replaying the builder each time.
pasteGrowST ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArray s
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> ST s (MutableByteArrayOffset s)
-- ^ Final buffer that accomodated the builder.
pasteGrowST !n b !arr0 = do
let go !arr !sz = pasteST b (MutableBytes arr 0 sz) >>= \case
Nothing -> do
let szNext = sz + n
arrNext <- PM.newByteArray szNext
go arrNext szNext
Just ix -> pure (MutableByteArrayOffset{array=arr,offset=ix})
go arr0 =<< PM.getSizeofMutableByteArray arr0
-- | Variant of 'pasteGrowST' that runs in 'IO'.
pasteGrowIO ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArray RealWorld
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> IO (MutableByteArrayOffset RealWorld)
-- ^ Final buffer that accomodated the builder.
pasteGrowIO !n b !arr = stToIO (pasteGrowST n b arr)
pasteST :: Builder -> MutableBytes s -> ST s (Maybe Int)
{-# inline pasteST #-}
pasteST (Builder f) (MutableBytes (MutableByteArray arr) (I# off) (I# len)) =
@ -76,6 +143,14 @@ construct f = Builder
Nothing -> (# s1, (-1#) #)
Just (I# n) -> (# s1, n #)
fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder
{-# inline fromUnsafe #-}
fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 ->
case fromIntegral (natVal' (proxy# :: Proxy# n)) of
I# req -> case len >=# req of
1# -> f arr off s0
_ -> (# s0, (-1#) #)
bytearray :: ByteArray -> Builder
bytearray a = bytes (Bytes a 0 (sizeofByteArray a))
@ -87,43 +162,11 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
else pure Nothing
word64Dec :: Word64 -> Builder
word64Dec (W64# w) = word64Dec# w
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
word64Dec# :: Word# -> Builder
{-# noinline word64Dec# #-}
word64Dec# w# = construct $ \(MutableBytes arr off0 len) -> if len >= 19
then if w /= 0
then do
let go off x = if x > 0
then do
let (y,z) = quotRem x 10
writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
go (off + 1) y
else do
reverseBytes arr off0 (off - 1)
pure (Just off)
go off0 w
else do
writeByteArray arr off0 (c2w '0')
pure (Just (off0 + 1))
else pure Nothing
where
w = W64# w#
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
{-# inline reverseBytes #-}
reverseBytes arr begin end = go begin end where
go ixA ixB = if ixA < ixB
then do
a :: Word8 <- readByteArray arr ixA
b :: Word8 <- readByteArray arr ixB
writeByteArray arr ixA b
writeByteArray arr ixB a
go (ixA + 1) (ixB - 1)
else pure ()
c2w :: Char -> Word8
c2w = fromIntegral . ord
word64PaddedUpperHex :: Word64 -> Builder
word64PaddedUpperHex w =
fromUnsafe (Unsafe.word64PaddedUpperHex w)
unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f

View file

@ -0,0 +1,165 @@
{-# language GADTSyntax #-}
{-# language KindSignatures #-}
{-# language ScopedTypeVariables #-}
{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language UnboxedTuples #-}
{-# language RankNTypes #-}
{-# language LambdaCase #-}
{-# language TypeOperators #-}
{-# language DataKinds #-}
-- | The functions in this module do not check to
-- see if there is enough space in the buffer.
module Data.ByteArray.Builder.Small.Unsafe
( Builder(..)
, run
, pasteST
, pasteIO
, construct
, append
, word64Dec
, word64PaddedUpperHex
) where
import Control.Monad.Primitive
import Control.Monad.ST
import Data.Bits
import Data.Bytes.Types
import Data.Char (ord)
import Data.Primitive
import GHC.Exts
import GHC.ST
import GHC.Word
import Data.Kind
import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
newtype Builder :: Nat -> Type where
Builder ::
(forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
-> Builder n
run :: forall n. KnownNat n
=> Builder n -- ^ Builder
-> ByteArray
{-# inline run #-}
run b = runST $ do
arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n)))
len <- pasteST b arr 0
shrinkMutableByteArray arr len
unsafeFreezeByteArray arr
-- | This function does not enforce the known upper bound on the
-- size. It is up to the user to do this.
pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int
{-# inline pasteST #-}
pasteST (Builder f) (MutableByteArray arr) (I# off) =
ST $ \s0 -> case f arr off s0 of
(# s1, r #) -> (# s1, (I# r) #)
-- | This function does not enforce the known upper bound on the
-- size. It is up to the user to do this.
pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int
{-# inline pasteIO #-}
pasteIO b m off = stToIO (pasteST b m off)
construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n
{-# inline construct #-}
construct f = Builder
$ \arr off s0 ->
case unST (f (MutableByteArray arr) (I# off)) s0 of
(# s1, (I# n) #) -> (# s1, n #)
append :: Builder n -> Builder m -> Builder (n + m)
append (Builder f) (Builder g) =
Builder $ \arr off0 s0 -> case f arr off0 s0 of
(# s1, r #) -> g arr r s1
-- | Requires up to 19 bytes.
word64Dec :: Word64 -> Builder 19
word64Dec (W64# w) = word64Dec# w
-- | Requires up to 19 bytes.
word64Dec# :: Word# -> Builder 19
{-# noinline word64Dec# #-}
word64Dec# w# = construct $ \arr off0 -> if w /= 0
then do
let go off x = if x > 0
then do
let (y,z) = quotRem x 10
writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
go (off + 1) y
else do
reverseBytes arr off0 (off - 1)
pure off
go off0 w
else do
writeByteArray arr off0 (c2w '0')
pure (off0 + 1)
where
w = W64# w#
-- Convert a number between 0 and 16 to the ASCII
-- representation of its hexadecimal character.
-- The use of fromIntegral causes us to incur an
-- unneeded bitmask. This actually needs a Word64
-- argument.
toHexUpper :: Word -> Word8
toHexUpper w' = fromIntegral
$ (complement theMask .&. loSolved)
.|. (theMask .&. hiSolved)
where
w = w' .&. 0xF
-- This is all ones if the value was >= 10
theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1
loSolved = w + 48
hiSolved = w + 55
-- | Requires up to 16 bytes.
word64PaddedUpperHex :: Word64 -> Builder 16
word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w
word64PaddedUpperHex# :: Word# -> Builder 16
{-# noinline word64PaddedUpperHex# #-}
word64PaddedUpperHex# w# = construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 60))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56))
writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52))
writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 48))
writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 44))
writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 40))
writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 36))
writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 32))
writeByteArray arr (off + 8) (toHexUpper (unsafeShiftR w 28))
writeByteArray arr (off + 9) (toHexUpper (unsafeShiftR w 24))
writeByteArray arr (off + 10) (toHexUpper (unsafeShiftR w 20))
writeByteArray arr (off + 11) (toHexUpper (unsafeShiftR w 16))
writeByteArray arr (off + 12) (toHexUpper (unsafeShiftR w 12))
writeByteArray arr (off + 13) (toHexUpper (unsafeShiftR w 8))
writeByteArray arr (off + 14) (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 15) (toHexUpper (unsafeShiftR w 0))
pure (off + 16)
where
w = W# w#
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
{-# inline reverseBytes #-}
reverseBytes arr begin end = go begin end where
go ixA ixB = if ixA < ixB
then do
a :: Word8 <- readByteArray arr ixA
b :: Word8 <- readByteArray arr ixB
writeByteArray arr ixA b
writeByteArray arr ixB a
go (ixA + 1) (ixB - 1)
else pure ()
c2w :: Char -> Word8
c2w = fromIntegral . ord
unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f
shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s ()
shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
primitive_ (shrinkMutableByteArray# arr sz)

View file

@ -1,11 +1,20 @@
{-# language BangPatterns #-}
{-# language ScopedTypeVariables #-}
{-# language TypeApplications #-}
import Control.Monad.ST (runST)
import Data.Bytes.Types (MutableBytes(..))
import Data.ByteArray.Builder.Small
import Data.Word
import Data.Char (ord)
import Data.Primitive (ByteArray)
import Debug.Trace
import Test.Tasty (defaultMain,testGroup,TestTree)
import Test.QuickCheck ((===))
import Text.Printf (printf)
import qualified Data.Primitive as PM
import qualified Data.List as L
import qualified Data.Vector as V
import qualified Test.Tasty.QuickCheck as TQC
import qualified Test.QuickCheck as QC
import qualified GHC.Exts as Exts
@ -21,8 +30,35 @@ tests = testGroup "Tests"
run 1 (word64Dec x <> word64Dec y <> word64Dec z)
===
pack (show x ++ show y ++ show z)
, TQC.testProperty "word64PaddedUpperHex" $ \w ->
run 1 (word64PaddedUpperHex w)
===
pack (showWord64PaddedUpperHex w)
, TQC.testProperty "pasteArrayST" $ \(xs :: [Word64]) ->
(runArray word64Dec (V.fromList xs))
===
pack (foldMap show xs)
]
pack :: String -> ByteArray
pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord)
-- This is used to test pasteArrayST
runArray ::
(a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> ByteArray -- ^ Number of elements serialized, serialization
runArray f !xs = runST $ do
let go !v0 !sz !chunks = if V.null v0
then pure (mconcat (L.reverse chunks))
else do
arr <- PM.newByteArray sz
(v1,MutableBytes _ off _) <- pasteArrayST (MutableBytes arr 0 sz) f v0
-- If nothing was serialized, we need a bigger buffer
let szNext = if V.length v0 == V.length v1 then sz + 1 else sz
c <- PM.unsafeFreezeByteArray =<< PM.resizeMutableByteArray arr off
go v1 szNext (c : chunks)
go xs 1 []
showWord64PaddedUpperHex :: Word64 -> String
showWord64PaddedUpperHex = printf "%016X"