Add Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#

2023-12-22 14:07:53 -05:00 · 2023-12-22 14:07:53 -05:00 · 0fcd93a9aa
commit 0fcd93a9aa
parent 7c86ace3d0
4 changed files with 93 additions and 41 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,11 @@ Note: Prior to version 0.3.4.0, this library was named
 `small-bytearray-builder` is now just a compatibility shim
 to ease the migration process.

+## 0.3.15.0 -- 2023-??-??
+
+* Add `Data.Bytes.Builder.Unsafe.pasteUtf8TextJson#` for users who need
+  to perform JSON string encoding without using a builder.
+
 ## 0.3.14.0 -- 2023-07-20

 * Add `runOntoLength`.
--- a/bytebuild.cabal
+++ b/bytebuild.cabal
@ -1,6 +1,6 @@
 cabal-version: 2.2
 name: bytebuild
-version: 0.3.14.0
+version: 0.3.15.0
 synopsis: Build byte arrays
 description:
  This is similar to the builder facilities provided by
--- a/src/Data/Bytes/Builder.hs
+++ b/src/Data/Bytes/Builder.hs
@ -159,10 +159,10 @@ import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
+import Data.Bytes.Builder.Unsafe (pasteUtf8TextJson#)
 import Data.Bytes.Chunks (Chunks(ChunksCons,ChunksNil))
 import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes))
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
-import Data.Char (ord)
 import Data.Foldable (foldlM)
 import Data.Int (Int64,Int32,Int16,Int8)
 import Data.Primitive (ByteArray(..),MutableByteArray(..),PrimArray(..))
@ -173,7 +173,7 @@ import Data.Word.Zigzag (toZigzagNative,toZigzag32,toZigzag64)
 import Foreign.C.String (CStringLen)
 import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
 import GHC.Exts (MutableByteArray#,Addr#,(*#),oneShot)
-import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
+import GHC.Exts (Int(I#),Int#,State#,ByteArray#,(>=#))
 import GHC.Exts (RealWorld,(+#),(-#),(<#))
 import GHC.Integer.Logarithms.Compat (integerLog2#)
 import GHC.IO (IO(IO),stToIO)
@ -795,38 +795,18 @@ asWord8s (PrimArray x) = PrimArray x
 -- byte sequence is UTF-8 encoded text.
 slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
 {-# noinline slicedUtf8TextJson #-}
-slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction reqLen $ \dst doff0 -> do
-  PM.writeByteArray dst doff0 (c2w '"')
-  let go !soff !slen !doff = if slen > 0
-        then case indexChar8Array (ByteArray src#) soff of
-          '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
-          '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
-          c -> if c >= '\x20'
-            then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
-            else case c of
-              '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
-              '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
-              '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
-              _ -> do
-                write2 dst doff '\\' 'u'
-                doff' <- UnsafeBounded.pasteST
-                  (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
-                  dst (doff + 2)
-                go (soff + 1) (slen - 1) doff'
-        else pure doff
-  doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
-  PM.writeByteArray dst doffRes (c2w '"')
-  pure (doffRes + 1)
+slicedUtf8TextJson !src# !soff0# !slen0# = fromFunction# reqLen#
+  ( \dst# doff0# s0# -> pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# )
  where
-  slen0 = I# slen0#
  -- We multiply by 6 because, in the worst case, everything might be in the
  -- unprintable ASCII range. The plus 2 is for the quotes on the ends.
-  reqLen = (6 * slen0) + 2
+  !reqLen# = (6# *# slen0# ) +# 2#

 -- | Constructor for 'Builder' that works on a function with lifted
 -- arguments instead of unlifted ones. This is just as unsafe as the
 -- actual constructor.
 fromFunction :: Int -> (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder
+{-# inline fromFunction #-}
 fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
        1# -> (# s0, buf0, off0, len0, cs0 #)
@ -837,11 +817,17 @@ fromFunction (I# req) f = Builder $ \buf0 off0 len0 cs0 s0 ->
   in case unST (f (MutableByteArray buf1) (I# off1)) s1 of
        (# s2, I# off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)

-- Internal. Write two characters in the ASCII plane to a byte array.
-write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
-write2 marr ix a b = do
-  PM.writeByteArray marr ix (c2w a)
-  PM.writeByteArray marr (ix + 1) (c2w b)
+fromFunction# :: Int# -> (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) ) -> Builder
+{-# inline fromFunction# #-}
+fromFunction# req f = Builder $ \buf0 off0 len0 cs0 s0 ->
+  let !(# s1, buf1, off1, len1, cs1 #) = case len0 >=# req of
+        1# -> (# s0, buf0, off0, len0, cs0 #)
+        _ -> let !(I# lenX) = max 4080 (I# req) in
+          case Exts.newByteArray# lenX s0 of
+            (# sX, bufX #) ->
+              (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
+   in case f buf1 off1 s1 of
+        (# s2, off2 #) -> (# s2, buf1, off2, len1 -# (off2 -# off1), cs1 #)

 -- | Create a builder from text. The text will be UTF-8 encoded.
 shortTextUtf8 :: ShortText -> Builder
@ -864,6 +850,7 @@ textUtf8 (I.Text (A.ByteArray b) off len) =
 -- * @\\_"_\/ ==\> "\\\\_\\"_\/"@ (escapes backslashes and quotes)
 -- * @hello\<ESC\>world ==> "hello\\u001Bworld"@ (where @\<ESC\>@ is code point 0x1B)
 shortTextJsonString :: ShortText -> Builder
+{-# inline shortTextJsonString #-}
 shortTextJsonString a =
  let !(ByteArray ba) = shortTextToByteArray a
      !(I# len) = PM.sizeofByteArray (ByteArray ba)
@ -1204,23 +1191,20 @@ shortTextToByteArray :: ShortText -> ByteArray
 shortTextToByteArray x = case TS.toShortByteString x of
  SBS a -> ByteArray a

-indexChar8Array :: ByteArray -> Int -> Char
-indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
-
-c2w :: Char -> Word8
-c2w = fromIntegral . ord
-
 -- | Encode a signed machine-sized integer with LEB-128. This uses
 -- zig-zag encoding.
 intLEB128 :: Int -> Builder
+{-# inline intLEB128 #-}
 intLEB128 = wordLEB128 . toZigzagNative

 -- | Encode a 32-bit signed integer with LEB-128. This uses zig-zag encoding.
 int32LEB128 :: Int32 -> Builder
+{-# inline int32LEB128 #-}
 int32LEB128 = word32LEB128 . toZigzag32

 -- | Encode a 64-bit signed integer with LEB-128. This uses zig-zag encoding.
 int64LEB128 :: Int64 -> Builder
+{-# inline int64LEB128 #-}
 int64LEB128 = word64LEB128 . toZigzag64

 -- | Encode a machine-sized word with LEB-128.
--- a/src/Data/Bytes/Builder/Unsafe.hs
+++ b/src/Data/Bytes/Builder/Unsafe.hs
@ -33,22 +33,25 @@ module Data.Bytes.Builder.Unsafe
    -- @Data.Bytes.Builder@ instead.
  , stringUtf8
  , cstring
+    -- * Pasting with Preconditions
+  , pasteUtf8TextJson#
  ) where

 import Control.Monad.Primitive (primitive_)
 import Data.Bytes.Chunks (Chunks(ChunksCons))
 import Data.Bytes.Types (Bytes(Bytes))
+import Data.Char (ord)
 import Data.Primitive (MutableByteArray(..),ByteArray(..))
+import Data.Word (Word8)
 import Foreign.C.String (CString)
 import GHC.Base (unpackCString#,unpackCStringUtf8#)
-import GHC.Exts ((-#),(+#),(>#),(>=#))
+import GHC.Exts ((-#),(+#),(>#),(>=#),Char(C#))
 import GHC.Exts (Addr#,ByteArray#,MutableByteArray#,Int(I#),Ptr(Ptr))
 import GHC.Exts (RealWorld,IsString,Int#,State#)
-import GHC.ST (ST(ST))
 import GHC.IO (stToIO)
+import GHC.ST (ST(ST))

 import qualified Compat as C
-
 import qualified Data.Bytes.Builder.Bounded as Bounded
 import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
 import qualified Data.Primitive as PM
@ -311,3 +314,63 @@ commitDistance target !n (Mutable buf len cs) =
  case Exts.sameMutableByteArray# target buf of
    1# -> n +# len
    _ -> commitDistance target (n +# len) cs
+
+-- | Encode (UTF-8 encoded) text as a JSON string, wrapping it in double quotes.
+-- This escapes all characters with code points below @0x20@.
+--
+-- * Precondition: The slice of the byte argument is UTF-8 encoded text.
+-- * Precondition: There is enough space in the buffer for the result
+--   to be written to. A simple way to ensure enough space is to allocate
+--   @3N + 2@ bytes, where N is the length of the argument. However, the
+--   caller may use clever heuristics to find a lower upper bound.
+-- * Result: The next offset in the destination buffer
+pasteUtf8TextJson# ::
+     ByteArray# -- ^ source
+  -> Int# -- ^ source offset
+  -> Int# -- ^ source length
+  -> MutableByteArray# s -- ^ destination buffer
+  -> Int# -- ^ offset into destination buffer
+  -> State# s -- ^ state token
+  -> (# State# s, Int# #) -- returns next destination offset
+{-# noinline pasteUtf8TextJson# #-}
+pasteUtf8TextJson# src# soff0# slen0# dst# doff0# s0# =
+  let ST f = do
+        let dst = MutableByteArray dst#
+        let doff0 = I# doff0#
+        PM.writeByteArray dst doff0 (c2w '"')
+        let go !soff !slen !doff = if slen > 0
+              then case indexChar8Array (ByteArray src#) soff of
+                '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
+                '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
+                c -> if c >= '\x20'
+                  then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
+                  else case c of
+                    '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
+                    '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
+                    '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
+                    _ -> do
+                      write2 dst doff '\\' 'u'
+                      doff' <- UnsafeBounded.pasteST
+                        (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
+                        dst (doff + 2)
+                      go (soff + 1) (slen - 1) doff'
+              else pure doff
+        doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
+        PM.writeByteArray dst doffRes (c2w '"')
+        pure (doffRes + 1)
+      !(# !s1, I# dstFinal #) = f s0#
+   in (# s1, dstFinal #)
+
+c2w :: Char -> Word8
+{-# inline c2w #-}
+c2w = fromIntegral . ord
+
+-- Internal. Write two characters in the ASCII plane to a byte array.
+write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
+write2 marr ix a b = do
+  PM.writeByteArray marr ix (c2w a)
+  PM.writeByteArray marr (ix + 1) (c2w b)
+
+indexChar8Array :: ByteArray -> Int -> Char
+{-# inline indexChar8Array #-}
+indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)