Add functions for encoding ShortText as UTF-8 and as JSON

2019-08-07 15:23:02 -04:00 · 2019-08-07 15:23:02 -04:00 · 5d3d6d1afd
commit 5d3d6d1afd
parent 1fd4b3ab1e
5 changed files with 106 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,8 @@
 * Add a `pasteGrowST` for length-indexed builders.
 * Add function for rendering floating-point numbers in a slightly
  inaccurate way.
+* Add `word16Dec`.
+* Add functions for encoding `ShortText` as UTF-8 and as a JSON string.

 ## 0.1.1.0 -- 2019-07-30

--- a/small-bytearray-builder.cabal
+++ b/small-bytearray-builder.cabal
@ -44,6 +44,8 @@ library
    , primitive-offset >=0.2 && <0.3
    , run-st >=0.1 && <0.2
    , vector >=0.12.0.3 && <0.13
+    , bytestring >=0.10.8.2 && <0.11
+    , text-short >=0.1.3 && <0.2
  if flag(checked)
    build-depends: primitive-checked >= 0.7 && <0.8
  else
--- a/src/Data/ByteArray/Builder/Small.hs
+++ b/src/Data/ByteArray/Builder/Small.hs
@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small
    -- * Materialized Byte Sequences
  , bytes
  , bytearray
+  , shortTextUtf8
+  , shortTextJsonString
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
+  , word16Dec
  , int64Dec
  , word64PaddedUpperHex
  , word32PaddedUpperHex
@ -50,7 +53,12 @@ import GHC.ST
 import GHC.Word
 import GHC.TypeLits (KnownNat,natVal')
 import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
+import Data.ByteString.Short.Internal (ShortByteString(SBS))
+import Data.Text.Short (ShortText)
+import Data.Char (ord)

+import qualified GHC.Exts as Exts
+import qualified Data.Text.Short as TS
 import qualified Data.Primitive as PM
 import qualified Data.Vector as V
 import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
    pure (Just (off + slen))
  else pure Nothing

+-- Internal function. Precondition, the referenced slice of the
+-- byte sequence is UTF-8 encoded text.
+slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
+{-# inline slicedUtf8TextJson #-}
+slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
+  let slen0 = I# slen0#
+   in if dlen0 > (2 * slen0) + 2
+        then do
+          PM.writeByteArray dst doff0 (c2w '"')
+          let go !soff !slen !doff = if slen > 0
+                then case indexChar8Array (ByteArray src#) soff of
+                  '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
+                  '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
+                  '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
+                  '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
+                  '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
+                  c -> if c >= '\x20'
+                    then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
+                    else do
+                      write2 dst doff '\\' 'u'
+                      doff' <- Unsafe.pasteST
+                        (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
+                        dst (doff + 2)
+                      go (soff + 1) (slen - 1) doff'
+                else pure doff
+          doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
+          PM.writeByteArray dst doffRes (c2w '"')
+          pure (Just (doffRes + 1))
+        else pure Nothing
+
+-- Internal. Write two characters in the ASCII plane to a byte array.
+write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
+write2 marr ix a b = do
+  PM.writeByteArray marr ix (c2w a)
+  PM.writeByteArray marr (ix + 1) (c2w b)
+
+-- | Create a builder from text. The text will be UTF-8 encoded.
+shortTextUtf8 :: ShortText -> Builder
+shortTextUtf8 a =
+  let ba = shortTextToByteArray a
+   in bytes (Bytes ba 0 (sizeofByteArray ba))
+
+-- | Create a builder from text. The text will be UTF-8 encoded,
+-- and JSON special characters will be escaped. Additionally, the
+-- result is surrounded by double quotes. For example:
+--
+-- * @foo ==> "foo"@
+-- * @\_"_/ ==> "\\_\"_/"@
+-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
+shortTextJsonString :: ShortText -> Builder
+shortTextJsonString a =
+  let !(ByteArray ba) = shortTextToByteArray a
+      !(I# len) = PM.sizeofByteArray (ByteArray ba)
+   in slicedUtf8TextJson ba 0# len
+
 -- | Encodes an unsigned 64-bit integer as decimal.
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word64Dec :: Word64 -> Builder
 word64Dec w = fromUnsafe (Unsafe.word64Dec w)

+-- | Encodes an unsigned 16-bit integer as decimal.
+-- This encoding never starts with a zero unless the
+-- argument was zero.
+word16Dec :: Word16 -> Builder
+word16Dec w = fromUnsafe (Unsafe.word16Dec w)
+
 -- | Encode a double-floating-point number, using decimal notation or
 -- scientific notation depending on the magnitude. This has undefined
 -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w)
 -- word in a big-endian fashion.
 word16BE :: Word16 -> Builder
 word16BE w = fromUnsafe (Unsafe.word16BE w)
+
+-- ShortText is already UTF-8 encoded. This is a no-op.
+shortTextToByteArray :: ShortText -> ByteArray
+shortTextToByteArray x = case TS.toShortByteString x of
+  SBS a -> ByteArray a
+
+indexChar8Array :: ByteArray -> Int -> Char
+indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
+
+c2w :: Char -> Word8
+c2w = fromIntegral . ord 
--- a/src/Data/ByteArray/Builder/Small/Unsafe.hs
+++ b/src/Data/ByteArray/Builder/Small/Unsafe.hs
@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
+  , word16Dec
  , int64Dec
  , word64PaddedUpperHex
  , word32PaddedUpperHex
@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
 -- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
 word64Dec :: Word64 -> Builder 19
-word64Dec (W64# w) = word64Dec# w
+word64Dec (W64# w) = wordCommonDec# w
+
+-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
+-- This encoding never starts with a zero unless the argument was zero.
+word16Dec :: Word16 -> Builder 5
+word16Dec (W16# w) = wordCommonDec# w

 -- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w
 int64Dec :: Int64 -> Builder 20
 int64Dec (I64# w) = int64Dec# w

-- | Requires up to 19 bytes.
-word64Dec# :: Word# -> Builder 19
-{-# noinline word64Dec# #-}
-word64Dec# w# = construct $ \arr off0 -> if w /= 0
+-- Requires a number of bytes that is bounded by the size of
+-- the word. This is only used internally.
+wordCommonDec# :: Word# -> Builder n
+{-# noinline wordCommonDec# #-}
+wordCommonDec# w# = construct $ \arr off0 -> if w /= 0
  then internalWordLoop arr off0 (W# w#)
  else do
    writeByteArray arr off0 (c2w '0')
@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
 -- This is adapted from androider's code in https://stackoverflow.com/a/7097567
 -- The checks for infinity and NaN have been removed. Note that this is a little
 -- inaccurate. This is very visible when encoding a number like 2.25, which
-- is perfectly represented as a IEEE 754 floating point number but is goofed
+-- is perfectly represented as an IEEE 754 floating point number but is goofed
 -- up by this function.
 -- If you modify this function, please take a took at the resulting core.
 -- It currently performs no boxing at all, and it would be nice to keep
--- a/test/Main.hs
+++ b/test/Main.hs
@ -1,6 +1,7 @@
 {-# language BangPatterns #-}
 {-# language ScopedTypeVariables #-}
 {-# language TypeApplications #-}
+{-# language OverloadedStrings #-}

 import Control.Monad.ST (runST)
 import Data.Bytes.Types (MutableBytes(..))
@ -75,6 +76,14 @@ tests = testGroup "Tests"
        pack ("999999999") @=? run 1 (doubleDec 999999999)
    , THU.testCase "doubleDec-K" $
        pack ("-99999999") @=? run 1 (doubleDec (-99999999))
+    , THU.testCase "shortTextJsonString-A" $
+        pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
+    , THU.testCase "shortTextJsonString-B" $
+        pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
+    , THU.testCase "shortTextJsonString-C" $
+        pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
+    , THU.testCase "shortTextJsonString-D" $
+        pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
    ]
  , testGroup "alternate"
    [ TQC.testProperty "HexWord64" $ \x y ->