Add Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#

This commit is contained in:
Andrew Martin 2023-12-22 14:07:53 -05:00
parent 7c86ace3d0
commit 0fcd93a9aa
4 changed files with 93 additions and 41 deletions

View file

@ -5,6 +5,11 @@ Note: Prior to version 0.3.4.0, this library was named
`small-bytearray-builder` is now just a compatibility shim `small-bytearray-builder` is now just a compatibility shim
to ease the migration process. to ease the migration process.
## 0.3.15.0 -- 2023-??-??
* Add `Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#` for users who need
to perform JSON string encoding without using a builder.
## 0.3.14.0 -- 2023-07-20 ## 0.3.14.0 -- 2023-07-20
* Add `runOntoLength`. * Add `runOntoLength`.

View file

@ -1,6 +1,6 @@
cabal-version: 2.2 cabal-version: 2.2
name: bytebuild name: bytebuild
version: 0.3.14.0 version: 0.3.15.0
synopsis: Build byte arrays synopsis: Build byte arrays
description: description:
This is similar to the builder facilities provided by This is similar to the builder facilities provided by

View file

@ -159,10 +159,10 @@ import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.Bytes.Builder.Unsafe (commitsOntoChunks) import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks) import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect) import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
import Data.Bytes.Builder.Unsafe (pasteUtf8TextJson#)
import Data.Bytes.Chunks (Chunks(ChunksCons,ChunksNil)) import Data.Bytes.Chunks (Chunks(ChunksCons,ChunksNil))
import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Char (ord)
import Data.Foldable (foldlM) import Data.Foldable (foldlM)
import Data.Int (Int64,Int32,Int16,Int8) import Data.Int (Int64,Int32,Int16,Int8)
import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..)) import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
@ -173,7 +173,7 @@ import Data.Word.Zigzag (toZigzagNative,toZigzag32,toZigzag64)
import Foreign.C.String (CStringLen) import Foreign.C.String (CStringLen)
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder) import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
import GHC.Exts (MutableByteArray#,Addr#,(*#),oneShot) import GHC.Exts (MutableByteArray#,Addr#,(*#),oneShot)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#)) import GHC.Exts (Int(I#),Int#,State#,ByteArray#,(>=#))
import GHC.Exts (RealWorld,(+#),(-#),(<#)) import GHC.Exts (RealWorld,(+#),(-#),(<#))
import GHC.Integer.Logarithms.Compat (integerLog2#) import GHC.Integer.Logarithms.Compat (integerLog2#)
import GHC.IO (IO(IO),stToIO) import GHC.IO (IO(IO),stToIO)
@ -795,38 +795,18 @@ asWord8s (PrimArray x) = PrimArray x
-- byte sequence is UTF-8 encoded text. -- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
{-# noinline slicedUtf8TextJson #-} {-# noinline slicedUtf8TextJson #-}
slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction# reqLen#
PM.writeByteArray dst doff0 (c2w '"') ( \dst# doff0# s0# -> pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# )
let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
c -> if c >= '\x20'
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else case c of
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
_ -> do
write2 dst doff '\\' 'u'
doff' <- UnsafeBounded.pasteST
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"')
pure (doffRes + 1)
where where
slen0 = I# slen0#
-- We multiply by 6 because, in the worst case, everything might be in the -- We multiply by 6 because, in the worst case, everything might be in the
-- unprintable ASCII range. The plus 2 is for the quotes on the ends. -- unprintable ASCII range. The plus 2 is for the quotes on the ends.
reqLen = (6 * slen0) + 2 !reqLen# = (6# *# slen0# ) +# 2#
-- | Constructor for 'Builder' that works on a function with lifted -- | Constructor for 'Builder' that works on a function with lifted
-- arguments instead of unlifted ones. This is just as unsafe as the -- arguments instead of unlifted ones. This is just as unsafe as the
-- actual constructor. -- actual constructor.
fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
{-# inline fromFunction #-}
fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 -> fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
1# -> (# s0, buf0, off0, len0, cs0 #) 1# -> (# s0, buf0, off0, len0, cs0 #)
@ -837,11 +817,17 @@ fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
in case unST (f (MutableByteArray buf1) (I# off1)) s1 of in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
(# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #) (# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- Internal. Write two characters in the ASCII plane to a byte array. fromFunction# :: Int# -> (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) ) -> Builder
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s () {-# inline fromFunction# #-}
write2 marr ix a b = do fromFunction# req f = Builder $ \buf0 off0 len0 cs0 s0 ->
PM.writeByteArray marr ix (c2w a) let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
PM.writeByteArray marr (ix + 1) (c2w b) 1# -> (# s0, buf0, off0, len0, cs0 #)
_ -> let !(I# lenX) = max 4080 (I# req) in
case Exts.newByteArray# lenX s0 of
(# sX, bufX #) ->
(# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
in case f buf1 off1 s1 of
(# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- | Create a builder from text. The text will be UTF-8 encoded. -- | Create a builder from text. The text will be UTF-8 encoded.
shortTextUtf8 :: ShortText -> Builder shortTextUtf8 :: ShortText -> Builder
@ -864,6 +850,7 @@ textUtf8 (I.Text (A.ByteArray b) off len) =
-- * @\\_"_\/ ==\> "\\\\_\\"_\/"@ (escapes backslashes and quotes) -- * @\\_"_\/ ==\> "\\\\_\\"_\/"@ (escapes backslashes and quotes)
-- * @hello\<ESC\>world ==> "hello\\u001Bworld"@ (where @\<ESC\>@ is code point 0x1B) -- * @hello\<ESC\>world ==> "hello\\u001Bworld"@ (where @\<ESC\>@ is code point 0x1B)
shortTextJsonString :: ShortText -> Builder shortTextJsonString :: ShortText -> Builder
{-# inline shortTextJsonString #-}
shortTextJsonString a = shortTextJsonString a =
let !(ByteArray ba) = shortTextToByteArray a let !(ByteArray ba) = shortTextToByteArray a
!(I# len) = PM.sizeofByteArray (ByteArray ba) !(I# len) = PM.sizeofByteArray (ByteArray ba)
@ -1204,23 +1191,20 @@ shortTextToByteArray :: ShortText -> ByteArray
shortTextToByteArray x = case TS.toShortByteString x of shortTextToByteArray x = case TS.toShortByteString x of
SBS a -> ByteArray a SBS a -> ByteArray a
indexChar8Array :: ByteArray -> Int -> Char
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord
-- | Encode a signed machine-sized integer with LEB-128. This uses -- | Encode a signed machine-sized integer with LEB-128. This uses
-- zig-zag encoding. -- zig-zag encoding.
intLEB128 :: Int -> Builder intLEB128 :: Int -> Builder
{-# inline intLEB128 #-}
intLEB128 = wordLEB128 . toZigzagNative intLEB128 = wordLEB128 . toZigzagNative
-- | Encode a 32-bit signed integer with LEB-128. This uses zig-zag encoding. -- | Encode a 32-bit signed integer with LEB-128. This uses zig-zag encoding.
int32LEB128 :: Int32 -> Builder int32LEB128 :: Int32 -> Builder
{-# inline int32LEB128 #-}
int32LEB128 = word32LEB128 . toZigzag32 int32LEB128 = word32LEB128 . toZigzag32
-- | Encode a 64-bit signed integer with LEB-128. This uses zig-zag encoding. -- | Encode a 64-bit signed integer with LEB-128. This uses zig-zag encoding.
int64LEB128 :: Int64 -> Builder int64LEB128 :: Int64 -> Builder
{-# inline int64LEB128 #-}
int64LEB128 = word64LEB128 . toZigzag64 int64LEB128 = word64LEB128 . toZigzag64
-- | Encode a machine-sized word with LEB-128. -- | Encode a machine-sized word with LEB-128.

View file

@ -33,22 +33,25 @@ module Data.Bytes.Builder.Unsafe
-- @Data.Bytes.Builder@ instead. -- @Data.Bytes.Builder@ instead.
, stringUtf8 , stringUtf8
, cstring , cstring
-- * Pasting with Preconditions
, pasteUtf8TextJson#
) where ) where
import Control.Monad.Primitive (primitive_) import Control.Monad.Primitive (primitive_)
import Data.Bytes.Chunks (Chunks(ChunksCons)) import Data.Bytes.Chunks (Chunks(ChunksCons))
import Data.Bytes.Types (Bytes(Bytes)) import Data.Bytes.Types (Bytes(Bytes))
import Data.Char (ord)
import Data.Primitive (MutableByteArray(..),ByteArray(..)) import Data.Primitive (MutableByteArray(..),ByteArray(..))
import Data.Word (Word8)
import Foreign.C.String (CString) import Foreign.C.String (CString)
import GHC.Base (unpackCString#,unpackCStringUtf8#) import GHC.Base (unpackCString#,unpackCStringUtf8#)
import GHC.Exts ((-#),(+#),(>#),(>=#)) import GHC.Exts ((-#),(+#),(>#),(>=#),Char(C#))
import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr)) import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
import GHC.Exts (RealWorld,IsString,Int#,State#) import GHC.Exts (RealWorld,IsString,Int#,State#)
import GHC.ST (ST(ST))
import GHC.IO (stToIO) import GHC.IO (stToIO)
import GHC.ST (ST(ST))
import qualified Compat as C import qualified Compat as C
import qualified Data.Bytes.Builder.Bounded as Bounded import qualified Data.Bytes.Builder.Bounded as Bounded
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
@ -311,3 +314,63 @@ commitDistance target !n (Mutable buf len cs) =
case Exts.sameMutableByteArray# target buf of case Exts.sameMutableByteArray# target buf of
1# -> n +# len 1# -> n +# len
_ -> commitDistance target (n +# len) cs _ -> commitDistance target (n +# len) cs
-- | Encode (UTF-8 encoded) text as a JSON string, wrapping it in double quotes.
-- This escapes all characters with code points below @0x20@.
--
-- * Precondition: The slice of the byte argument is UTF-8 encoded text.
-- * Precondition: There is enough space in the buffer for the result
-- to be written to. A simple way to ensure enough space is to allocate
-- @3N + 2@ bytes, where N is the length of the argument. However, the
-- caller may use clever heuristics to find a lower upper bound.
-- * Result: The next offset in the destination buffer
pasteUtf8TextJson# ::
ByteArray# -- ^ source
-> Int# -- ^ source offset
-> Int# -- ^ source length
-> MutableByteArray# s -- ^ destination buffer
-> Int# -- ^ offset into destination buffer
-> State# s -- ^ state token
-> (# State# s, Int# #) -- returns next destination offset
{-# noinline pasteUtf8TextJson# #-}
pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# =
let ST f = do
let dst = MutableByteArray dst#
let doff0 = I# doff0#
PM.writeByteArray dst doff0 (c2w '"')
let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
c -> if c >= '\x20'
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else case c of
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
_ -> do
write2 dst doff '\\' 'u'
doff' <- UnsafeBounded.pasteST
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"')
pure (doffRes + 1)
!(# !s1, I# dstFinal #) = f s0#
in (# s1, dstFinal #)
c2w :: Char -> Word8
{-# inline c2w #-}
c2w = fromIntegral . ord
-- Internal. Write two characters in the ASCII plane to a byte array.
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
write2 marr ix a b = do
PM.writeByteArray marr ix (c2w a)
PM.writeByteArray marr (ix + 1) (c2w b)
indexChar8Array :: ByteArray -> Int -> Char
{-# inline indexChar8Array #-}
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)