bytebuild/src/Data/ByteArray/Builder.hs

491 lines
18 KiB
Haskell
Raw Normal View History

{-# language BangPatterns #-}
2019-06-25 15:18:34 -04:00
{-# language DuplicateRecordFields #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language RankNTypes #-}
2019-06-25 15:18:34 -04:00
{-# language ScopedTypeVariables #-}
{-# language UnboxedTuples #-}
module Data.ByteArray.Builder
( -- * Bounded Primitives
Builder
, construct
, fromBounded
-- * Evaluation
, run
, pasteST
2019-06-17 13:56:14 -04:00
, pasteIO
2019-06-25 15:18:34 -04:00
, pasteGrowST
, pasteGrowIO
, pasteArrayST
, pasteArrayIO
2019-06-17 13:56:14 -04:00
-- * Materialized Byte Sequences
, bytes
, bytearray
, shortTextUtf8
, shortTextJsonString
, cstring
, stringUtf8
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
2019-09-01 19:17:29 -04:00
, word32Dec
, word16Dec
2019-09-04 10:30:23 -04:00
, word8Dec
2019-09-04 14:34:33 -04:00
, wordDec
, int64Dec
2019-09-04 10:49:14 -04:00
, int32Dec
, int16Dec
, int8Dec
2019-09-04 10:30:23 -04:00
, intDec
-- * Unsigned Words
-- ** 64-bit
2019-06-25 15:18:34 -04:00
, word64PaddedUpperHex
-- ** 32-bit
2019-06-25 17:09:26 -04:00
, word32PaddedUpperHex
-- ** 16-bit
, word16PaddedUpperHex
, word16PaddedLowerHex
, word16LowerHex
, word16UpperHex
-- ** 8-bit
, word8PaddedUpperHex
, word8LowerHex
, ascii
, char
-- ** Machine-Readable
, word64BE
, word32BE
, word16BE
, word8
2019-09-18 11:45:02 -04:00
-- ** Prefixing with Length
, consLength32BE
2019-09-19 12:10:13 -04:00
, consLength64BE
-- * Encode Floating-Point Types
-- ** Human-Readable
, doubleDec
) where
import Control.Monad.Primitive (primitive_)
import Control.Monad.ST (ST,stToIO)
2019-06-25 15:18:34 -04:00
import Control.Monad.ST.Run (runByteArrayST)
import Data.ByteArray.Builder.Unsafe (Builder(Builder))
import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring)
import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
import Data.Char (ord)
2019-09-04 10:49:14 -04:00
import Data.Int (Int64,Int32,Int16,Int8)
import Data.Primitive (ByteArray(..),MutableByteArray(..))
2019-06-25 15:18:34 -04:00
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.Text.Short (ShortText)
import Data.Word (Word64,Word32,Word16,Word8)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,RealWorld,(>=#),(/=#))
2019-09-18 11:45:02 -04:00
import GHC.Exts ((+#),(-#))
import GHC.ST (ST(ST))
2019-06-25 15:18:34 -04:00
import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic
import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS
2019-06-25 15:18:34 -04:00
import qualified Data.Primitive as PM
import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Bounded as Bounded
import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded
2019-09-01 19:17:29 -04:00
-- | Run a builder. An accurate size hint is important for good performance.
2019-06-25 17:09:26 -04:00
-- The size hint should be slightly larger than the actual size.
run ::
Int -- ^ Hint for upper bound on size
-> Builder -- ^ Builder
-> ByteArray
2019-06-25 15:18:34 -04:00
run hint b = runByteArrayST $ do
let go !n = do
arr <- PM.newByteArray n
pasteST b (MutableBytes arr 0 n) >>= \case
Nothing -> go (n + 64)
Just len -> do
shrinkMutableByteArray arr len
PM.unsafeFreezeByteArray arr
go (max hint 1)
2019-06-25 17:09:26 -04:00
-- | Variant of 'pasteArrayST' that runs in 'IO'.
2019-06-25 15:18:34 -04:00
pasteArrayIO ::
MutableBytes RealWorld -- ^ Buffer
-> (a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> IO (V.Vector a, MutableBytes RealWorld) -- ^ Shifted vector, shifted buffer
pasteArrayIO !arr f !xs = stToIO (pasteArrayST arr f xs)
2019-06-25 17:09:26 -04:00
-- | Fold over a vector, applying the builder to each element until
-- the buffer cannot accomodate any more.
2019-06-25 15:18:34 -04:00
pasteArrayST ::
MutableBytes s -- ^ Buffer
-> (a -> Builder) -- ^ Builder
-> V.Vector a -- ^ Elements to serialize
-> ST s (V.Vector a, MutableBytes s) -- ^ Shifted vector, shifted buffer
pasteArrayST (MutableBytes arr off0 len0) f !xs0 = do
let go !xs !ixBufA !lenBufA = if V.length xs > 0
then do
let a = V.unsafeHead xs
pasteST (f a) (MutableBytes arr ixBufA lenBufA) >>= \case
Nothing -> pure (xs,MutableBytes arr ixBufA lenBufA)
Just ixBufB ->
go (V.unsafeTail xs) ixBufB (lenBufA + (ixBufA - ixBufB))
else pure (xs,MutableBytes arr ixBufA lenBufA)
go xs0 off0 len0
-- | Paste the builder into the byte array starting at offset zero.
-- This repeatedly reallocates the byte array if it cannot accomodate
-- the builder, replaying the builder each time.
pasteGrowST ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArrayOffset s
2019-06-25 15:18:34 -04:00
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> ST s (MutableByteArrayOffset s)
-- ^ Final buffer that accomodated the builder.
pasteGrowST !n b !(MutableByteArrayOffset arr0 off0) = do
let go !arr !sz = pasteST b (MutableBytes arr off0 (sz - off0)) >>= \case
2019-06-25 15:18:34 -04:00
Nothing -> do
let szNext = sz + n
arrNext <- PM.resizeMutableByteArray arr szNext
2019-06-25 15:18:34 -04:00
go arrNext szNext
Just ix -> pure (MutableByteArrayOffset{array=arr,offset=ix})
go arr0 =<< PM.getSizeofMutableByteArray arr0
-- | Variant of 'pasteGrowST' that runs in 'IO'.
pasteGrowIO ::
Int -- ^ How many bytes to grow by at a time
-> Builder
-> MutableByteArrayOffset RealWorld
2019-06-25 15:18:34 -04:00
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> IO (MutableByteArrayOffset RealWorld)
-- ^ Final buffer that accomodated the builder.
pasteGrowIO !n b !arr = stToIO (pasteGrowST n b arr)
2019-06-25 17:09:26 -04:00
-- | Execute the builder, pasting its contents into a buffer.
-- If the buffer is not large enough, this returns 'Nothing'.
-- Otherwise, it returns the index in the buffer that follows
-- the payload just written.
pasteST :: Builder -> MutableBytes s -> ST s (Maybe Int)
{-# inline pasteST #-}
pasteST (Builder f) (MutableBytes (MutableByteArray arr) (I# off) (I# len)) =
ST $ \s0 -> case f arr off len s0 of
(# s1, r #) -> if Exts.isTrue# (r /=# (-1#))
then (# s1, Just (I# r) #)
else (# s1, Nothing #)
2019-06-25 17:09:26 -04:00
-- | Variant of 'pasteST' that runs in 'IO'.
2019-06-17 13:56:14 -04:00
pasteIO :: Builder -> MutableBytes RealWorld -> IO (Maybe Int)
{-# inline pasteIO #-}
pasteIO b m = stToIO (pasteST b m)
2019-06-25 17:09:26 -04:00
-- | Constructor for 'Builder' that works on a function with lifted
-- arguments instead of unlifted ones. This is just as unsafe as the
-- actual constructor.
construct :: (forall s. MutableBytes s -> ST s (Maybe Int)) -> Builder
construct f = Builder
$ \arr off len s0 ->
case unST (f (MutableBytes (MutableByteArray arr) (I# off) (I# len))) s0 of
(# s1, m #) -> case m of
Nothing -> (# s1, (-1#) #)
Just (I# n) -> (# s1, n #)
-- | Convert a bounded builder to an unbounded one. If the size
-- is a constant, use @Arithmetic.Nat.constant@ as the first argument
-- to let GHC conjure up this value for you.
fromBounded ::
Arithmetic.Nat n
-> Bounded.Builder n
-> Builder
{-# inline fromBounded #-}
fromBounded n (UnsafeBounded.Builder f) = Builder $ \arr off len s0 ->
let !(I# req) = Nat.demote n in
case len >=# req of
1# -> f arr off s0
_ -> (# s0, (-1#) #)
2019-06-25 15:18:34 -04:00
2019-06-25 17:09:26 -04:00
-- | Create a builder from an unsliced byte sequence.
2019-06-17 13:56:14 -04:00
bytearray :: ByteArray -> Builder
bytearray a = bytes (Bytes a 0 (PM.sizeofByteArray a))
2019-06-17 13:56:14 -04:00
2019-06-25 17:09:26 -04:00
-- | Create a builder from a sliced byte sequence.
2019-06-17 13:56:14 -04:00
bytes :: Bytes -> Builder
bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len >= slen
then do
PM.copyByteArray arr off src soff slen
pure (Just (off + slen))
2019-06-17 13:56:14 -04:00
else pure Nothing
-- Internal function. Precondition, the referenced slice of the
-- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
{-# inline slicedUtf8TextJson #-}
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
let slen0 = I# slen0#
in if dlen0 > (2 * slen0) + 2
then do
PM.writeByteArray dst doff0 (c2w '"')
let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
c -> if c >= '\x20'
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else do
write2 dst doff '\\' 'u'
doff' <- UnsafeBounded.pasteST
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"')
pure (Just (doffRes + 1))
else pure Nothing
-- Internal. Write two characters in the ASCII plane to a byte array.
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
write2 marr ix a b = do
PM.writeByteArray marr ix (c2w a)
PM.writeByteArray marr (ix + 1) (c2w b)
-- | Create a builder from text. The text will be UTF-8 encoded.
shortTextUtf8 :: ShortText -> Builder
shortTextUtf8 a =
let ba = shortTextToByteArray a
in bytes (Bytes ba 0 (PM.sizeofByteArray ba))
-- | Create a builder from text. The text will be UTF-8 encoded,
-- and JSON special characters will be escaped. Additionally, the
-- result is surrounded by double quotes. For example:
--
-- * @foo ==> "foo"@
-- * @\_"_/ ==> "\\_\"_/"@
-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
shortTextJsonString :: ShortText -> Builder
shortTextJsonString a =
let !(ByteArray ba) = shortTextToByteArray a
!(I# len) = PM.sizeofByteArray (ByteArray ba)
in slicedUtf8TextJson ba 0# len
2019-06-25 17:09:26 -04:00
-- | Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word64Dec :: Word64 -> Builder
word64Dec w = fromBounded Nat.constant (Bounded.word64Dec w)
2019-06-25 15:18:34 -04:00
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
2019-09-01 19:17:29 -04:00
word32Dec :: Word32 -> Builder
word32Dec w = fromBounded Nat.constant (Bounded.word32Dec w)
2019-09-01 19:17:29 -04:00
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word16Dec :: Word16 -> Builder
word16Dec w = fromBounded Nat.constant (Bounded.word16Dec w)
2019-09-04 10:30:23 -04:00
-- | Encodes an unsigned 8-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word8Dec :: Word8 -> Builder
word8Dec w = fromBounded Nat.constant (Bounded.word8Dec w)
2019-09-04 14:34:33 -04:00
-- | Encodes an unsigned machine-sized integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
wordDec :: Word -> Builder
wordDec w = fromBounded Nat.constant (Bounded.wordDec w)
-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
-- crash, but the generated numbers will be nonsense.
doubleDec :: Double -> Builder
doubleDec w = fromBounded Nat.constant (Bounded.doubleDec w)
-- | Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int64Dec :: Int64 -> Builder
int64Dec w = fromBounded Nat.constant (Bounded.int64Dec w)
2019-09-04 10:49:14 -04:00
-- | Encodes a signed 32-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int32Dec :: Int32 -> Builder
int32Dec w = fromBounded Nat.constant (Bounded.int32Dec w)
-- | Encodes a signed 16-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int16Dec :: Int16 -> Builder
int16Dec w = fromBounded Nat.constant (Bounded.int16Dec w)
-- | Encodes a signed 8-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int8Dec :: Int8 -> Builder
int8Dec w = fromBounded Nat.constant (Bounded.int8Dec w)
2019-09-04 10:30:23 -04:00
-- | Encodes a signed machine-sized integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
intDec :: Int -> Builder
intDec w = fromBounded Nat.constant (Bounded.intDec w)
2019-06-25 17:09:26 -04:00
-- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 16 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @00000000000003FE@.
2019-06-25 15:18:34 -04:00
word64PaddedUpperHex :: Word64 -> Builder
word64PaddedUpperHex w =
fromBounded Nat.constant (Bounded.word64PaddedUpperHex w)
2019-06-25 17:09:26 -04:00
-- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 8 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @000003FE@.
word32PaddedUpperHex :: Word32 -> Builder
word32PaddedUpperHex w =
fromBounded Nat.constant (Bounded.word32PaddedUpperHex w)
2019-06-25 17:09:26 -04:00
-- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 4 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @03FE@.
word16PaddedUpperHex :: Word16 -> Builder
word16PaddedUpperHex w =
fromBounded Nat.constant (Bounded.word16PaddedUpperHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 4 digits. This uses lowercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @03fe@.
word16PaddedLowerHex :: Word16 -> Builder
word16PaddedLowerHex w =
fromBounded Nat.constant (Bounded.word16PaddedLowerHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal without leading
-- zeroes. This uses lowercase for the alphabetical digits. For
-- example, this encodes the number 1022 as @3fe@.
word16LowerHex :: Word16 -> Builder
word16LowerHex w =
fromBounded Nat.constant (Bounded.word16LowerHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal without leading
-- zeroes. This uses uppercase for the alphabetical digits. For
-- example, this encodes the number 1022 as @3FE@.
word16UpperHex :: Word16 -> Builder
word16UpperHex w =
fromBounded Nat.constant (Bounded.word16UpperHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal without leading
-- zeroes. This uses lowercase for the alphabetical digits. For
-- example, this encodes the number 1022 as @3FE@.
word8LowerHex :: Word8 -> Builder
word8LowerHex w =
fromBounded Nat.constant (Bounded.word8LowerHex w)
-- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 2 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 11 as @0B@.
word8PaddedUpperHex :: Word8 -> Builder
word8PaddedUpperHex w =
fromBounded Nat.constant (Bounded.word8PaddedUpperHex w)
-- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder
ascii c = fromBounded Nat.constant (Bounded.ascii c)
-- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder
char c = fromBounded Nat.constant (Bounded.char c)
unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f
shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s ()
shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
primitive_ (Exts.shrinkMutableByteArray# arr sz)
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion.
word64BE :: Word64 -> Builder
word64BE w = fromBounded Nat.constant (Bounded.word64BE w)
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a big-endian fashion.
word32BE :: Word32 -> Builder
word32BE w = fromBounded Nat.constant (Bounded.word32BE w)
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a big-endian fashion.
word16BE :: Word16 -> Builder
word16BE w = fromBounded Nat.constant (Bounded.word16BE w)
2019-09-18 11:45:02 -04:00
-- | Requires exactly 1 byte.
word8 :: Word8 -> Builder
word8 w = fromBounded Nat.constant (Bounded.word8 w)
2019-09-18 11:45:02 -04:00
-- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 32-bit word. The need to prefix
-- a builder with its length shows up a numbers of wire protocols
-- including those of PostgreSQL and Apache Kafka. Note the
-- equivalence:
--
-- > forall (n :: Int) (x :: Builder).
-- > let sz = sizeofByteArray (run n (consLength32BE x))
-- > consLength32BE x === word32BE (fromIntegral sz) <> x
--
-- However, using 'consLength32BE' is much more efficient here
-- since it only materializes the 'ByteArray' once.
consLength32BE :: Builder -> Builder
consLength32BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 4# of
1# -> case f arr (off +# 4# ) (len -# 4# ) s0 of
(# s1, r #) -> case r of
(-1#) -> (# s1, (-1#) #)
_ ->
let ST g = UnsafeBounded.pasteST
(Bounded.word32BE (fromIntegral ((I# r - I# off) - 4)))
(MutableByteArray arr)
(I# off)
in case g s1 of
(# s2, _ #) -> (# s2, r #)
_ -> (# s0, (-1#) #)
2019-09-19 12:10:13 -04:00
-- | Prefix a builder with its size in bytes. This size is
-- presented as a big-endian 64-bit word. See 'consLength32BE'.
consLength64BE :: Builder -> Builder
consLength64BE (Builder f) = Builder $ \arr off len s0 -> case len >=# 8# of
1# -> case f arr (off +# 8# ) (len -# 8# ) s0 of
(# s1, r #) -> case r of
(-1#) -> (# s1, (-1#) #)
_ ->
let ST g = UnsafeBounded.pasteST
(Bounded.word64BE (fromIntegral ((I# r - I# off) - 8)))
(MutableByteArray arr)
(I# off)
in case g s1 of
(# s2, _ #) -> (# s2, r #)
_ -> (# s0, (-1#) #)
-- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray
shortTextToByteArray x = case TS.toShortByteString x of
SBS a -> ByteArray a
indexChar8Array :: ByteArray -> Int -> Char
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord