Add Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#
This commit is contained in:
parent
7c86ace3d0
commit
0fcd93a9aa
4 changed files with 93 additions and 41 deletions
|
@ -5,6 +5,11 @@ Note: Prior to version 0.3.4.0, this library was named
|
|||
`small-bytearray-builder` is now just a compatibility shim
|
||||
to ease the migration process.
|
||||
|
||||
## 0.3.15.0 -- 2023-??-??
|
||||
|
||||
* Add `Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#` for users who need
|
||||
to perform JSON string encoding without using a builder.
|
||||
|
||||
## 0.3.14.0 -- 2023-07-20
|
||||
|
||||
* Add `runOntoLength`.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
cabal-version: 2.2
|
||||
name: bytebuild
|
||||
version: 0.3.14.0
|
||||
version: 0.3.15.0
|
||||
synopsis: Build byte arrays
|
||||
description:
|
||||
This is similar to the builder facilities provided by
|
||||
|
|
|
@ -159,10 +159,10 @@ import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
|
|||
import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
|
||||
import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
|
||||
import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
|
||||
import Data.Bytes.Builder.Unsafe (pasteUtf8TextJson#)
|
||||
import Data.Bytes.Chunks (Chunks(ChunksCons,ChunksNil))
|
||||
import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
|
||||
import Data.ByteString.Short.Internal (ShortByteString(SBS))
|
||||
import Data.Char (ord)
|
||||
import Data.Foldable (foldlM)
|
||||
import Data.Int (Int64,Int32,Int16,Int8)
|
||||
import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
|
||||
|
@ -173,7 +173,7 @@ import Data.Word.Zigzag (toZigzagNative,toZigzag32,toZigzag64)
|
|||
import Foreign.C.String (CStringLen)
|
||||
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
|
||||
import GHC.Exts (MutableByteArray#,Addr#,(*#),oneShot)
|
||||
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
|
||||
import GHC.Exts (Int(I#),Int#,State#,ByteArray#,(>=#))
|
||||
import GHC.Exts (RealWorld,(+#),(-#),(<#))
|
||||
import GHC.Integer.Logarithms.Compat (integerLog2#)
|
||||
import GHC.IO (IO(IO),stToIO)
|
||||
|
@ -795,38 +795,18 @@ asWord8s (PrimArray x) = PrimArray x
|
|||
-- byte sequence is UTF-8 encoded text.
|
||||
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
|
||||
{-# noinline slicedUtf8TextJson #-}
|
||||
slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do
|
||||
PM.writeByteArray dst doff0 (c2w '"')
|
||||
let go !soff !slen !doff = if slen > 0
|
||||
then case indexChar8Array (ByteArray src#) soff of
|
||||
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
c -> if c >= '\x20'
|
||||
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
|
||||
else case c of
|
||||
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
_ -> do
|
||||
write2 dst doff '\\' 'u'
|
||||
doff' <- UnsafeBounded.pasteST
|
||||
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||
dst (doff + 2)
|
||||
go (soff + 1) (slen - 1) doff'
|
||||
else pure doff
|
||||
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
|
||||
PM.writeByteArray dst doffRes (c2w '"')
|
||||
pure (doffRes + 1)
|
||||
slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction# reqLen#
|
||||
( \dst# doff0# s0# -> pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# )
|
||||
where
|
||||
slen0 = I# slen0#
|
||||
-- We multiply by 6 because, in the worst case, everything might be in the
|
||||
-- unprintable ASCII range. The plus 2 is for the quotes on the ends.
|
||||
reqLen = (6 * slen0) + 2
|
||||
!reqLen# = (6# *# slen0# ) +# 2#
|
||||
|
||||
-- | Constructor for 'Builder' that works on a function with lifted
|
||||
-- arguments instead of unlifted ones. This is just as unsafe as the
|
||||
-- actual constructor.
|
||||
fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
|
||||
{-# inline fromFunction #-}
|
||||
fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
|
||||
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
|
||||
1# -> (# s0, buf0, off0, len0, cs0 #)
|
||||
|
@ -837,11 +817,17 @@ fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
|
|||
in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
|
||||
(# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
|
||||
|
||||
-- Internal. Write two characters in the ASCII plane to a byte array.
|
||||
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
|
||||
write2 marr ix a b = do
|
||||
PM.writeByteArray marr ix (c2w a)
|
||||
PM.writeByteArray marr (ix + 1) (c2w b)
|
||||
fromFunction# :: Int# -> (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) ) -> Builder
|
||||
{-# inline fromFunction# #-}
|
||||
fromFunction# req f = Builder $ \buf0 off0 len0 cs0 s0 ->
|
||||
let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
|
||||
1# -> (# s0, buf0, off0, len0, cs0 #)
|
||||
_ -> let !(I# lenX) = max 4080 (I# req) in
|
||||
case Exts.newByteArray# lenX s0 of
|
||||
(# sX, bufX #) ->
|
||||
(# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
|
||||
in case f buf1 off1 s1 of
|
||||
(# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
|
||||
|
||||
-- | Create a builder from text. The text will be UTF-8 encoded.
|
||||
shortTextUtf8 :: ShortText -> Builder
|
||||
|
@ -864,6 +850,7 @@ textUtf8 (I.Text (A.ByteArray b) off len) =
|
|||
-- * @\\_"_\/ ==\> "\\\\_\\"_\/"@ (escapes backslashes and quotes)
|
||||
-- * @hello\<ESC\>world ==> "hello\\u001Bworld"@ (where @\<ESC\>@ is code point 0x1B)
|
||||
shortTextJsonString :: ShortText -> Builder
|
||||
{-# inline shortTextJsonString #-}
|
||||
shortTextJsonString a =
|
||||
let !(ByteArray ba) = shortTextToByteArray a
|
||||
!(I# len) = PM.sizeofByteArray (ByteArray ba)
|
||||
|
@ -1204,23 +1191,20 @@ shortTextToByteArray :: ShortText -> ByteArray
|
|||
shortTextToByteArray x = case TS.toShortByteString x of
|
||||
SBS a -> ByteArray a
|
||||
|
||||
indexChar8Array :: ByteArray -> Int -> Char
|
||||
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
|
||||
|
||||
c2w :: Char -> Word8
|
||||
c2w = fromIntegral . ord
|
||||
|
||||
-- | Encode a signed machine-sized integer with LEB-128. This uses
|
||||
-- zig-zag encoding.
|
||||
intLEB128 :: Int -> Builder
|
||||
{-# inline intLEB128 #-}
|
||||
intLEB128 = wordLEB128 . toZigzagNative
|
||||
|
||||
-- | Encode a 32-bit signed integer with LEB-128. This uses zig-zag encoding.
|
||||
int32LEB128 :: Int32 -> Builder
|
||||
{-# inline int32LEB128 #-}
|
||||
int32LEB128 = word32LEB128 . toZigzag32
|
||||
|
||||
-- | Encode a 64-bit signed integer with LEB-128. This uses zig-zag encoding.
|
||||
int64LEB128 :: Int64 -> Builder
|
||||
{-# inline int64LEB128 #-}
|
||||
int64LEB128 = word64LEB128 . toZigzag64
|
||||
|
||||
-- | Encode a machine-sized word with LEB-128.
|
||||
|
|
|
@ -33,22 +33,25 @@ module Data.Bytes.Builder.Unsafe
|
|||
-- @Data.Bytes.Builder@ instead.
|
||||
, stringUtf8
|
||||
, cstring
|
||||
-- * Pasting with Preconditions
|
||||
, pasteUtf8TextJson#
|
||||
) where
|
||||
|
||||
import Control.Monad.Primitive (primitive_)
|
||||
import Data.Bytes.Chunks (Chunks(ChunksCons))
|
||||
import Data.Bytes.Types (Bytes(Bytes))
|
||||
import Data.Char (ord)
|
||||
import Data.Primitive (MutableByteArray(..),ByteArray(..))
|
||||
import Data.Word (Word8)
|
||||
import Foreign.C.String (CString)
|
||||
import GHC.Base (unpackCString#,unpackCStringUtf8#)
|
||||
import GHC.Exts ((-#),(+#),(>#),(>=#))
|
||||
import GHC.Exts ((-#),(+#),(>#),(>=#),Char(C#))
|
||||
import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
|
||||
import GHC.Exts (RealWorld,IsString,Int#,State#)
|
||||
import GHC.ST (ST(ST))
|
||||
import GHC.IO (stToIO)
|
||||
import GHC.ST (ST(ST))
|
||||
|
||||
import qualified Compat as C
|
||||
|
||||
import qualified Data.Bytes.Builder.Bounded as Bounded
|
||||
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
|
||||
import qualified Data.Primitive as PM
|
||||
|
@ -311,3 +314,63 @@ commitDistance target !n (Mutable buf len cs) =
|
|||
case Exts.sameMutableByteArray# target buf of
|
||||
1# -> n +# len
|
||||
_ -> commitDistance target (n +# len) cs
|
||||
|
||||
-- | Encode (UTF-8 encoded) text as a JSON string, wrapping it in double quotes.
|
||||
-- This escapes all characters with code points below @0x20@.
|
||||
--
|
||||
-- * Precondition: The slice of the byte argument is UTF-8 encoded text.
|
||||
-- * Precondition: There is enough space in the buffer for the result
|
||||
-- to be written to. A simple way to ensure enough space is to allocate
|
||||
-- @3N + 2@ bytes, where N is the length of the argument. However, the
|
||||
-- caller may use clever heuristics to find a lower upper bound.
|
||||
-- * Result: The next offset in the destination buffer
|
||||
pasteUtf8TextJson# ::
|
||||
ByteArray# -- ^ source
|
||||
-> Int# -- ^ source offset
|
||||
-> Int# -- ^ source length
|
||||
-> MutableByteArray# s -- ^ destination buffer
|
||||
-> Int# -- ^ offset into destination buffer
|
||||
-> State# s -- ^ state token
|
||||
-> (# State# s, Int# #) -- returns next destination offset
|
||||
{-# noinline pasteUtf8TextJson# #-}
|
||||
pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# =
|
||||
let ST f = do
|
||||
let dst = MutableByteArray dst#
|
||||
let doff0 = I# doff0#
|
||||
PM.writeByteArray dst doff0 (c2w '"')
|
||||
let go !soff !slen !doff = if slen > 0
|
||||
then case indexChar8Array (ByteArray src#) soff of
|
||||
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
c -> if c >= '\x20'
|
||||
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
|
||||
else case c of
|
||||
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
_ -> do
|
||||
write2 dst doff '\\' 'u'
|
||||
doff' <- UnsafeBounded.pasteST
|
||||
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||
dst (doff + 2)
|
||||
go (soff + 1) (slen - 1) doff'
|
||||
else pure doff
|
||||
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
|
||||
PM.writeByteArray dst doffRes (c2w '"')
|
||||
pure (doffRes + 1)
|
||||
!(# !s1, I# dstFinal #) = f s0#
|
||||
in (# s1, dstFinal #)
|
||||
|
||||
c2w :: Char -> Word8
|
||||
{-# inline c2w #-}
|
||||
c2w = fromIntegral . ord
|
||||
|
||||
-- Internal. Write two characters in the ASCII plane to a byte array.
|
||||
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
|
||||
write2 marr ix a b = do
|
||||
PM.writeByteArray marr ix (c2w a)
|
||||
PM.writeByteArray marr (ix + 1) (c2w b)
|
||||
|
||||
indexChar8Array :: ByteArray -> Int -> Char
|
||||
{-# inline indexChar8Array #-}
|
||||
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
|
||||
|
|
Loading…
Reference in a new issue