Add Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#

2023-12-22 14:07:53 -05:00 · 2023-12-22 14:07:53 -05:00 · 0fcd93a9aa
commit 0fcd93a9aa
parent 7c86ace3d0
4 changed files with 93 additions and 41 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,11 @@ Note: Prior to version 0.3.4.0, this library was named
 `small-bytearray-builder` is now just a compatibility shim
 to ease the migration process.
 ## 0.3.15.0 -- 2023-??-??
 * Add `Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#` for users who need
  to perform JSON string encoding without using a builder.
 ## 0.3.14.0 -- 2023-07-20
 * Add `runOntoLength`.
--- a/bytebuild.cabal
+++ b/bytebuild.cabal
@ -1,6 +1,6 @@
 cabal-version: 2.2
 name: bytebuild
-version: 0.3.14.0
+version: 0.3.15.0
 synopsis: Build byte arrays
 description:
  This is similar to the builder facilities provided by
--- a/src/Data/Bytes/Builder.hs
+++ b/src/Data/Bytes/Builder.hs
@ -159,10 +159,10 @@ import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
 import Data.Bytes.Builder.Unsafe (pasteUtf8TextJson#)
 import Data.Bytes.Chunks (Chunks(ChunksCons,ChunksNil))
 import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
 import Data.Char (ord)
 import Data.Foldable (foldlM)
 import Data.Int (Int64,Int32,Int16,Int8)
 import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
@ -173,7 +173,7 @@ import Data.Word.Zigzag (toZigzagNative,toZigzag32,toZigzag64)
 import Foreign.C.String (CStringLen)
 import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
 import GHC.Exts (MutableByteArray#,Addr#,(*#),oneShot)
-import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
+import GHC.Exts (Int(I#),Int#,State#,ByteArray#,(>=#))
 import GHC.Exts (RealWorld,(+#),(-#),(<#))
 import GHC.Integer.Logarithms.Compat (integerLog2#)
 import GHC.IO (IO(IO),stToIO)
@ -795,38 +795,18 @@ asWord8s (PrimArray x) = PrimArray x
 -- byte sequence is UTF-8 encoded text.
 slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
 {-# noinline slicedUtf8TextJson #-}
-slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do
+slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction# reqLen#
-  PM.writeByteArray dst doff0 (c2w '"')
+  ( \dst# doff0# s0# -> pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# )
  let go !soff !slen !doff = if slen > 0
        then case indexChar8Array (ByteArray src#) soff of
          '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
          '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
          c -> if c >= '\x20'
            then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
            else case c of
              '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
              '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
              '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
              _ -> do
                write2 dst doff '\\' 'u'
                doff' <- UnsafeBounded.pasteST
                  (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
                  dst (doff + 2)
                go (soff + 1) (slen - 1) doff'
        else pure doff
  doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
  PM.writeByteArray dst doffRes (c2w '"')
  pure (doffRes + 1)
  where
  slen0 = I# slen0#
  -- We multiply by 6 because, in the worst case, everything might be in the
  -- unprintable ASCII range. The plus 2 is for the quotes on the ends.
-  reqLen = (6 * slen0) + 2
+  !reqLen# = (6# *# slen0# ) +# 2#
 -- | Constructor for 'Builder' that works on a function with lifted
 -- arguments instead of unlifted ones. This is just as unsafe as the
 -- actual constructor.
 fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
 {-# inline fromFunction #-}
 fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
        1# -> (# s0, buf0, off0, len0, cs0 #)
@ -837,11 +817,17 @@ fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
   in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
        (# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
-- Internal. Write two characters in the ASCII plane to a byte array.
+fromFunction# :: Int# -> (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) ) -> Builder
-write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
+{-# inline fromFunction# #-}
-write2 marr ix a b = do
+fromFunction# req f = Builder $ \buf0 off0 len0 cs0 s0 ->
-  PM.writeByteArray marr ix (c2w a)
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
-  PM.writeByteArray marr (ix + 1) (c2w b)
+        1# -> (# s0, buf0, off0, len0, cs0 #)
        _ -> let !(I# lenX) = max 4080 (I# req) in
          case Exts.newByteArray# lenX s0 of
            (# sX, bufX #) ->
              (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
   in case f buf1 off1 s1 of
        (# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)
 -- | Create a builder from text. The text will be UTF-8 encoded.
 shortTextUtf8 :: ShortText -> Builder
@ -864,6 +850,7 @@ textUtf8 (I.Text (A.ByteArray b) off len) =
 -- * @\\_"_\/ ==\> "\\\\_\\"_\/"@ (escapes backslashes and quotes)
 -- * @hello\<ESC\>world ==> "hello\\u001Bworld"@ (where @\<ESC\>@ is code point 0x1B)
 shortTextJsonString :: ShortText -> Builder
 {-# inline shortTextJsonString #-}
 shortTextJsonString a =
  let !(ByteArray ba) = shortTextToByteArray a
      !(I# len) = PM.sizeofByteArray (ByteArray ba)
@ -1204,23 +1191,20 @@ shortTextToByteArray :: ShortText -> ByteArray
 shortTextToByteArray x = case TS.toShortByteString x of
  SBS a -> ByteArray a
 indexChar8Array :: ByteArray -> Int -> Char
 indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
 c2w :: Char -> Word8
 c2w = fromIntegral . ord
 -- | Encode a signed machine-sized integer with LEB-128. This uses
 -- zig-zag encoding.
 intLEB128 :: Int -> Builder
 {-# inline intLEB128 #-}
 intLEB128 = wordLEB128 . toZigzagNative
 -- | Encode a 32-bit signed integer with LEB-128. This uses zig-zag encoding.
 int32LEB128 :: Int32 -> Builder
 {-# inline int32LEB128 #-}
 int32LEB128 = word32LEB128 . toZigzag32
 -- | Encode a 64-bit signed integer with LEB-128. This uses zig-zag encoding.
 int64LEB128 :: Int64 -> Builder
 {-# inline int64LEB128 #-}
 int64LEB128 = word64LEB128 . toZigzag64
 -- | Encode a machine-sized word with LEB-128.
--- a/src/Data/Bytes/Builder/Unsafe.hs
+++ b/src/Data/Bytes/Builder/Unsafe.hs
@ -33,22 +33,25 @@ module Data.Bytes.Builder.Unsafe
    -- @Data.Bytes.Builder@ instead.
  , stringUtf8
  , cstring
    -- * Pasting with Preconditions
  , pasteUtf8TextJson#
  ) where
 import Control.Monad.Primitive (primitive_)
 import Data.Bytes.Chunks (Chunks(ChunksCons))
 import Data.Bytes.Types (Bytes(Bytes))
 import Data.Char (ord)
 import Data.Primitive (MutableByteArray(..),ByteArray(..))
 import Data.Word (Word8)
 import Foreign.C.String (CString)
 import GHC.Base (unpackCString#,unpackCStringUtf8#)
-import GHC.Exts ((-#),(+#),(>#),(>=#))
+import GHC.Exts ((-#),(+#),(>#),(>=#),Char(C#))
 import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
 import GHC.Exts (RealWorld,IsString,Int#,State#)
 import GHC.ST (ST(ST))
 import GHC.IO (stToIO)
 import GHC.ST (ST(ST))
 import qualified Compat as C
 import qualified Data.Bytes.Builder.Bounded as Bounded
 import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
 import qualified Data.Primitive as PM
@ -311,3 +314,63 @@ commitDistance target !n (Mutable buf len cs) =
  case Exts.sameMutableByteArray# target buf of
    1# -> n +# len
    _ -> commitDistance target (n +# len) cs
 -- | Encode (UTF-8 encoded) text as a JSON string, wrapping it in double quotes.
 -- This escapes all characters with code points below @0x20@.
 --
 -- * Precondition: The slice of the byte argument is UTF-8 encoded text.
 -- * Precondition: There is enough space in the buffer for the result
 --   to be written to. A simple way to ensure enough space is to allocate
 --   @3N + 2@ bytes, where N is the length of the argument. However, the
 --   caller may use clever heuristics to find a lower upper bound.
 -- * Result: The next offset in the destination buffer
 pasteUtf8TextJson# ::
     ByteArray# -- ^ source
  -> Int# -- ^ source offset
  -> Int# -- ^ source length
  -> MutableByteArray# s -- ^ destination buffer
  -> Int# -- ^ offset into destination buffer
  -> State# s -- ^ state token
  -> (# State# s, Int# #) -- returns next destination offset
 {-# noinline pasteUtf8TextJson# #-}
 pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# =
  let ST f = do
        let dst = MutableByteArray dst#
        let doff0 = I# doff0#
        PM.writeByteArray dst doff0 (c2w '"')
        let go !soff !slen !doff = if slen > 0
              then case indexChar8Array (ByteArray src#) soff of
                '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
                '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
                c -> if c >= '\x20'
                  then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
                  else case c of
                    '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
                    '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
                    '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
                    _ -> do
                      write2 dst doff '\\' 'u'
                      doff' <- UnsafeBounded.pasteST
                        (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
                        dst (doff + 2)
                      go (soff + 1) (slen - 1) doff'
              else pure doff
        doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
        PM.writeByteArray dst doffRes (c2w '"')
        pure (doffRes + 1)
      !(# !s1, I# dstFinal #) = f s0#
   in (# s1, dstFinal #)
 c2w :: Char -> Word8
 {-# inline c2w #-}
 c2w = fromIntegral . ord
 -- Internal. Write two characters in the ASCII plane to a byte array.
 write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
 write2 marr ix a b = do
  PM.writeByteArray marr ix (c2w a)
  PM.writeByteArray marr (ix + 1) (c2w b)
 indexChar8Array :: ByteArray -> Int -> Char
 {-# inline indexChar8Array #-}
 indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)