Add functions for encoding ShortText as UTF-8 and as JSON

This commit is contained in:
Andrew Martin 2019-08-07 15:23:02 -04:00
parent 1fd4b3ab1e
commit 5d3d6d1afd
5 changed files with 106 additions and 6 deletions

View file

@ -7,6 +7,8 @@
* Add a `pasteGrowST` for length-indexed builders.
* Add function for rendering floating-point numbers in a slightly
inaccurate way.
* Add `word16Dec`.
* Add functions for encoding `ShortText` as UTF-8 and as a JSON string.
## 0.1.1.0 -- 2019-07-30

View file

@ -44,6 +44,8 @@ library
, primitive-offset >=0.2 && <0.3
, run-st >=0.1 && <0.2
, vector >=0.12.0.3 && <0.13
, bytestring >=0.10.8.2 && <0.11
, text-short >=0.1.3 && <0.2
if flag(checked)
build-depends: primitive-checked >= 0.7 && <0.8
else

View file

@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small
-- * Materialized Byte Sequences
, bytes
, bytearray
, shortTextUtf8
, shortTextJsonString
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
, word16Dec
, int64Dec
, word64PaddedUpperHex
, word32PaddedUpperHex
@ -50,7 +53,12 @@ import GHC.ST
import GHC.Word
import GHC.TypeLits (KnownNat,natVal')
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Text.Short (ShortText)
import Data.Char (ord)
import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS
import qualified Data.Primitive as PM
import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
pure (Just (off + slen))
else pure Nothing
-- Internal function. Precondition, the referenced slice of the
-- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
{-# inline slicedUtf8TextJson #-}
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
let slen0 = I# slen0#
in if dlen0 > (2 * slen0) + 2
then do
PM.writeByteArray dst doff0 (c2w '"')
let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
c -> if c >= '\x20'
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else do
write2 dst doff '\\' 'u'
doff' <- Unsafe.pasteST
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"')
pure (Just (doffRes + 1))
else pure Nothing
-- Internal. Write two characters in the ASCII plane to a byte array.
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
write2 marr ix a b = do
PM.writeByteArray marr ix (c2w a)
PM.writeByteArray marr (ix + 1) (c2w b)
-- | Create a builder from text. The text will be UTF-8 encoded.
shortTextUtf8 :: ShortText -> Builder
shortTextUtf8 a =
let ba = shortTextToByteArray a
in bytes (Bytes ba 0 (sizeofByteArray ba))
-- | Create a builder from text. The text will be UTF-8 encoded,
-- and JSON special characters will be escaped. Additionally, the
-- result is surrounded by double quotes. For example:
--
-- * @foo ==> "foo"@
-- * @\_"_/ ==> "\\_\"_/"@
-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
shortTextJsonString :: ShortText -> Builder
shortTextJsonString a =
let !(ByteArray ba) = shortTextToByteArray a
!(I# len) = PM.sizeofByteArray (ByteArray ba)
in slicedUtf8TextJson ba 0# len
-- | Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word64Dec :: Word64 -> Builder
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word16Dec :: Word16 -> Builder
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w)
-- word in a big-endian fashion.
word16BE :: Word16 -> Builder
word16BE w = fromUnsafe (Unsafe.word16BE w)
-- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray
shortTextToByteArray x = case TS.toShortByteString x of
SBS a -> ByteArray a
indexChar8Array :: ByteArray -> Int -> Char
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord

View file

@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
, word16Dec
, int64Dec
, word64PaddedUpperHex
, word32PaddedUpperHex
@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word64Dec :: Word64 -> Builder 19
word64Dec (W64# w) = word64Dec# w
word64Dec (W64# w) = wordCommonDec# w
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word16Dec :: Word16 -> Builder 5
word16Dec (W16# w) = wordCommonDec# w
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w
int64Dec :: Int64 -> Builder 20
int64Dec (I64# w) = int64Dec# w
-- | Requires up to 19 bytes.
word64Dec# :: Word# -> Builder 19
{-# noinline word64Dec# #-}
word64Dec# w# = construct $ \arr off0 -> if w /= 0
-- Requires a number of bytes that is bounded by the size of
-- the word. This is only used internally.
wordCommonDec# :: Word# -> Builder n
{-# noinline wordCommonDec# #-}
wordCommonDec# w# = construct $ \arr off0 -> if w /= 0
then internalWordLoop arr off0 (W# w#)
else do
writeByteArray arr off0 (c2w '0')
@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
-- The checks for infinity and NaN have been removed. Note that this is a little
-- inaccurate. This is very visible when encoding a number like 2.25, which
-- is perfectly represented as a IEEE 754 floating point number but is goofed
-- is perfectly represented as an IEEE 754 floating point number but is goofed
-- up by this function.
-- If you modify this function, please take a took at the resulting core.
-- It currently performs no boxing at all, and it would be nice to keep

View file

@ -1,6 +1,7 @@
{-# language BangPatterns #-}
{-# language ScopedTypeVariables #-}
{-# language TypeApplications #-}
{-# language OverloadedStrings #-}
import Control.Monad.ST (runST)
import Data.Bytes.Types (MutableBytes(..))
@ -75,6 +76,14 @@ tests = testGroup "Tests"
pack ("999999999") @=? run 1 (doubleDec 999999999)
, THU.testCase "doubleDec-K" $
pack ("-99999999") @=? run 1 (doubleDec (-99999999))
, THU.testCase "shortTextJsonString-A" $
pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
, THU.testCase "shortTextJsonString-B" $
pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
, THU.testCase "shortTextJsonString-C" $
pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
, THU.testCase "shortTextJsonString-D" $
pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
]
, testGroup "alternate"
[ TQC.testProperty "HexWord64" $ \x y ->