Add functions for encoding ShortText as UTF-8 and as JSON

This commit is contained in:
Andrew Martin 2019-08-07 15:23:02 -04:00
parent 1fd4b3ab1e
commit 5d3d6d1afd
5 changed files with 106 additions and 6 deletions

View file

@ -7,6 +7,8 @@
* Add a `pasteGrowST` for length-indexed builders. * Add a `pasteGrowST` for length-indexed builders.
* Add function for rendering floating-point numbers in a slightly * Add function for rendering floating-point numbers in a slightly
inaccurate way. inaccurate way.
* Add `word16Dec`.
* Add functions for encoding `ShortText` as UTF-8 and as a JSON string.
## 0.1.1.0 -- 2019-07-30 ## 0.1.1.0 -- 2019-07-30

View file

@ -44,6 +44,8 @@ library
, primitive-offset >=0.2 && <0.3 , primitive-offset >=0.2 && <0.3
, run-st >=0.1 && <0.2 , run-st >=0.1 && <0.2
, vector >=0.12.0.3 && <0.13 , vector >=0.12.0.3 && <0.13
, bytestring >=0.10.8.2 && <0.11
, text-short >=0.1.3 && <0.2
if flag(checked) if flag(checked)
build-depends: primitive-checked >= 0.7 && <0.8 build-depends: primitive-checked >= 0.7 && <0.8
else else

View file

@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small
-- * Materialized Byte Sequences -- * Materialized Byte Sequences
, bytes , bytes
, bytearray , bytearray
, shortTextUtf8
, shortTextJsonString
-- * Encode Integral Types -- * Encode Integral Types
-- ** Human-Readable -- ** Human-Readable
, word64Dec , word64Dec
, word16Dec
, int64Dec , int64Dec
, word64PaddedUpperHex , word64PaddedUpperHex
, word32PaddedUpperHex , word32PaddedUpperHex
@ -50,7 +53,12 @@ import GHC.ST
import GHC.Word import GHC.Word
import GHC.TypeLits (KnownNat,natVal') import GHC.TypeLits (KnownNat,natVal')
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Text.Short (ShortText)
import Data.Char (ord)
import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Vector as V import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
pure (Just (off + slen)) pure (Just (off + slen))
else pure Nothing else pure Nothing
-- Internal function. Precondition, the referenced slice of the
-- byte sequence is UTF-8 encoded text.
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
{-# inline slicedUtf8TextJson #-}
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
let slen0 = I# slen0#
in if dlen0 > (2 * slen0) + 2
then do
PM.writeByteArray dst doff0 (c2w '"')
let go !soff !slen !doff = if slen > 0
then case indexChar8Array (ByteArray src#) soff of
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
c -> if c >= '\x20'
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else do
write2 dst doff '\\' 'u'
doff' <- Unsafe.pasteST
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
PM.writeByteArray dst doffRes (c2w '"')
pure (Just (doffRes + 1))
else pure Nothing
-- Internal. Write two characters in the ASCII plane to a byte array.
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
write2 marr ix a b = do
PM.writeByteArray marr ix (c2w a)
PM.writeByteArray marr (ix + 1) (c2w b)
-- | Create a builder from text. The text will be UTF-8 encoded.
shortTextUtf8 :: ShortText -> Builder
shortTextUtf8 a =
let ba = shortTextToByteArray a
in bytes (Bytes ba 0 (sizeofByteArray ba))
-- | Create a builder from text. The text will be UTF-8 encoded,
-- and JSON special characters will be escaped. Additionally, the
-- result is surrounded by double quotes. For example:
--
-- * @foo ==> "foo"@
-- * @\_"_/ ==> "\\_\"_/"@
-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
shortTextJsonString :: ShortText -> Builder
shortTextJsonString a =
let !(ByteArray ba) = shortTextToByteArray a
!(I# len) = PM.sizeofByteArray (ByteArray ba)
in slicedUtf8TextJson ba 0# len
-- | Encodes an unsigned 64-bit integer as decimal. -- | Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the -- This encoding never starts with a zero unless the
-- argument was zero. -- argument was zero.
word64Dec :: Word64 -> Builder word64Dec :: Word64 -> Builder
word64Dec w = fromUnsafe (Unsafe.word64Dec w) word64Dec w = fromUnsafe (Unsafe.word64Dec w)
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word16Dec :: Word16 -> Builder
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
-- | Encode a double-floating-point number, using decimal notation or -- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined -- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w)
-- word in a big-endian fashion. -- word in a big-endian fashion.
word16BE :: Word16 -> Builder word16BE :: Word16 -> Builder
word16BE w = fromUnsafe (Unsafe.word16BE w) word16BE w = fromUnsafe (Unsafe.word16BE w)
-- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray
shortTextToByteArray x = case TS.toShortByteString x of
SBS a -> ByteArray a
indexChar8Array :: ByteArray -> Int -> Char
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord

View file

@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe
-- * Encode Integral Types -- * Encode Integral Types
-- ** Human-Readable -- ** Human-Readable
, word64Dec , word64Dec
, word16Dec
, int64Dec , int64Dec
, word64PaddedUpperHex , word64PaddedUpperHex
, word32PaddedUpperHex , word32PaddedUpperHex
@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal. -- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero. -- This encoding never starts with a zero unless the argument was zero.
word64Dec :: Word64 -> Builder 19 word64Dec :: Word64 -> Builder 19
word64Dec (W64# w) = word64Dec# w word64Dec (W64# w) = wordCommonDec# w
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word16Dec :: Word16 -> Builder 5
word16Dec (W16# w) = wordCommonDec# w
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal. -- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero. -- This encoding never starts with a zero unless the argument was zero.
@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w
int64Dec :: Int64 -> Builder 20 int64Dec :: Int64 -> Builder 20
int64Dec (I64# w) = int64Dec# w int64Dec (I64# w) = int64Dec# w
-- | Requires up to 19 bytes. -- Requires a number of bytes that is bounded by the size of
word64Dec# :: Word# -> Builder 19 -- the word. This is only used internally.
{-# noinline word64Dec# #-} wordCommonDec# :: Word# -> Builder n
word64Dec# w# = construct $ \arr off0 -> if w /= 0 {-# noinline wordCommonDec# #-}
wordCommonDec# w# = construct $ \arr off0 -> if w /= 0
then internalWordLoop arr off0 (W# w#) then internalWordLoop arr off0 (W# w#)
else do else do
writeByteArray arr off0 (c2w '0') writeByteArray arr off0 (c2w '0')
@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567 -- This is adapted from androider's code in https://stackoverflow.com/a/7097567
-- The checks for infinity and NaN have been removed. Note that this is a little -- The checks for infinity and NaN have been removed. Note that this is a little
-- inaccurate. This is very visible when encoding a number like 2.25, which -- inaccurate. This is very visible when encoding a number like 2.25, which
-- is perfectly represented as a IEEE 754 floating point number but is goofed -- is perfectly represented as an IEEE 754 floating point number but is goofed
-- up by this function. -- up by this function.
-- If you modify this function, please take a took at the resulting core. -- If you modify this function, please take a took at the resulting core.
-- It currently performs no boxing at all, and it would be nice to keep -- It currently performs no boxing at all, and it would be nice to keep

View file

@ -1,6 +1,7 @@
{-# language BangPatterns #-} {-# language BangPatterns #-}
{-# language ScopedTypeVariables #-} {-# language ScopedTypeVariables #-}
{-# language TypeApplications #-} {-# language TypeApplications #-}
{-# language OverloadedStrings #-}
import Control.Monad.ST (runST) import Control.Monad.ST (runST)
import Data.Bytes.Types (MutableBytes(..)) import Data.Bytes.Types (MutableBytes(..))
@ -75,6 +76,14 @@ tests = testGroup "Tests"
pack ("999999999") @=? run 1 (doubleDec 999999999) pack ("999999999") @=? run 1 (doubleDec 999999999)
, THU.testCase "doubleDec-K" $ , THU.testCase "doubleDec-K" $
pack ("-99999999") @=? run 1 (doubleDec (-99999999)) pack ("-99999999") @=? run 1 (doubleDec (-99999999))
, THU.testCase "shortTextJsonString-A" $
pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
, THU.testCase "shortTextJsonString-B" $
pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
, THU.testCase "shortTextJsonString-C" $
pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
, THU.testCase "shortTextJsonString-D" $
pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
] ]
, testGroup "alternate" , testGroup "alternate"
[ TQC.testProperty "HexWord64" $ \x y -> [ TQC.testProperty "HexWord64" $ \x y ->