Add functions for encoding ShortText as UTF-8 and as JSON
This commit is contained in:
parent
1fd4b3ab1e
commit
5d3d6d1afd
5 changed files with 106 additions and 6 deletions
|
@ -7,6 +7,8 @@
|
|||
* Add a `pasteGrowST` for length-indexed builders.
|
||||
* Add function for rendering floating-point numbers in a slightly
|
||||
inaccurate way.
|
||||
* Add `word16Dec`.
|
||||
* Add functions for encoding `ShortText` as UTF-8 and as a JSON string.
|
||||
|
||||
## 0.1.1.0 -- 2019-07-30
|
||||
|
||||
|
|
|
@ -44,6 +44,8 @@ library
|
|||
, primitive-offset >=0.2 && <0.3
|
||||
, run-st >=0.1 && <0.2
|
||||
, vector >=0.12.0.3 && <0.13
|
||||
, bytestring >=0.10.8.2 && <0.11
|
||||
, text-short >=0.1.3 && <0.2
|
||||
if flag(checked)
|
||||
build-depends: primitive-checked >= 0.7 && <0.8
|
||||
else
|
||||
|
|
|
@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small
|
|||
-- * Materialized Byte Sequences
|
||||
, bytes
|
||||
, bytearray
|
||||
, shortTextUtf8
|
||||
, shortTextJsonString
|
||||
-- * Encode Integral Types
|
||||
-- ** Human-Readable
|
||||
, word64Dec
|
||||
, word16Dec
|
||||
, int64Dec
|
||||
, word64PaddedUpperHex
|
||||
, word32PaddedUpperHex
|
||||
|
@ -50,7 +53,12 @@ import GHC.ST
|
|||
import GHC.Word
|
||||
import GHC.TypeLits (KnownNat,natVal')
|
||||
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
|
||||
import Data.ByteString.Short.Internal (ShortByteString(SBS))
|
||||
import Data.Text.Short (ShortText)
|
||||
import Data.Char (ord)
|
||||
|
||||
import qualified GHC.Exts as Exts
|
||||
import qualified Data.Text.Short as TS
|
||||
import qualified Data.Primitive as PM
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
|
||||
|
@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
|
|||
pure (Just (off + slen))
|
||||
else pure Nothing
|
||||
|
||||
-- Internal function. Precondition, the referenced slice of the
|
||||
-- byte sequence is UTF-8 encoded text.
|
||||
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
|
||||
{-# inline slicedUtf8TextJson #-}
|
||||
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
|
||||
let slen0 = I# slen0#
|
||||
in if dlen0 > (2 * slen0) + 2
|
||||
then do
|
||||
PM.writeByteArray dst doff0 (c2w '"')
|
||||
let go !soff !slen !doff = if slen > 0
|
||||
then case indexChar8Array (ByteArray src#) soff of
|
||||
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||
c -> if c >= '\x20'
|
||||
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
|
||||
else do
|
||||
write2 dst doff '\\' 'u'
|
||||
doff' <- Unsafe.pasteST
|
||||
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||
dst (doff + 2)
|
||||
go (soff + 1) (slen - 1) doff'
|
||||
else pure doff
|
||||
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
|
||||
PM.writeByteArray dst doffRes (c2w '"')
|
||||
pure (Just (doffRes + 1))
|
||||
else pure Nothing
|
||||
|
||||
-- Internal. Write two characters in the ASCII plane to a byte array.
|
||||
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
|
||||
write2 marr ix a b = do
|
||||
PM.writeByteArray marr ix (c2w a)
|
||||
PM.writeByteArray marr (ix + 1) (c2w b)
|
||||
|
||||
-- | Create a builder from text. The text will be UTF-8 encoded.
|
||||
shortTextUtf8 :: ShortText -> Builder
|
||||
shortTextUtf8 a =
|
||||
let ba = shortTextToByteArray a
|
||||
in bytes (Bytes ba 0 (sizeofByteArray ba))
|
||||
|
||||
-- | Create a builder from text. The text will be UTF-8 encoded,
|
||||
-- and JSON special characters will be escaped. Additionally, the
|
||||
-- result is surrounded by double quotes. For example:
|
||||
--
|
||||
-- * @foo ==> "foo"@
|
||||
-- * @\_"_/ ==> "\\_\"_/"@
|
||||
-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
|
||||
shortTextJsonString :: ShortText -> Builder
|
||||
shortTextJsonString a =
|
||||
let !(ByteArray ba) = shortTextToByteArray a
|
||||
!(I# len) = PM.sizeofByteArray (ByteArray ba)
|
||||
in slicedUtf8TextJson ba 0# len
|
||||
|
||||
-- | Encodes an unsigned 64-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the
|
||||
-- argument was zero.
|
||||
word64Dec :: Word64 -> Builder
|
||||
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
|
||||
|
||||
-- | Encodes an unsigned 16-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the
|
||||
-- argument was zero.
|
||||
word16Dec :: Word16 -> Builder
|
||||
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
|
||||
|
||||
-- | Encode a double-floating-point number, using decimal notation or
|
||||
-- scientific notation depending on the magnitude. This has undefined
|
||||
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
|
||||
|
@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w)
|
|||
-- word in a big-endian fashion.
|
||||
word16BE :: Word16 -> Builder
|
||||
word16BE w = fromUnsafe (Unsafe.word16BE w)
|
||||
|
||||
-- ShortText is already UTF-8 encoded. This is a no-op.
|
||||
shortTextToByteArray :: ShortText -> ByteArray
|
||||
shortTextToByteArray x = case TS.toShortByteString x of
|
||||
SBS a -> ByteArray a
|
||||
|
||||
indexChar8Array :: ByteArray -> Int -> Char
|
||||
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
|
||||
|
||||
c2w :: Char -> Word8
|
||||
c2w = fromIntegral . ord
|
||||
|
|
|
@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe
|
|||
-- * Encode Integral Types
|
||||
-- ** Human-Readable
|
||||
, word64Dec
|
||||
, word16Dec
|
||||
, int64Dec
|
||||
, word64PaddedUpperHex
|
||||
, word32PaddedUpperHex
|
||||
|
@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
|
|||
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the argument was zero.
|
||||
word64Dec :: Word64 -> Builder 19
|
||||
word64Dec (W64# w) = word64Dec# w
|
||||
word64Dec (W64# w) = wordCommonDec# w
|
||||
|
||||
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the argument was zero.
|
||||
word16Dec :: Word16 -> Builder 5
|
||||
word16Dec (W16# w) = wordCommonDec# w
|
||||
|
||||
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the argument was zero.
|
||||
|
@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w
|
|||
int64Dec :: Int64 -> Builder 20
|
||||
int64Dec (I64# w) = int64Dec# w
|
||||
|
||||
-- | Requires up to 19 bytes.
|
||||
word64Dec# :: Word# -> Builder 19
|
||||
{-# noinline word64Dec# #-}
|
||||
word64Dec# w# = construct $ \arr off0 -> if w /= 0
|
||||
-- Requires a number of bytes that is bounded by the size of
|
||||
-- the word. This is only used internally.
|
||||
wordCommonDec# :: Word# -> Builder n
|
||||
{-# noinline wordCommonDec# #-}
|
||||
wordCommonDec# w# = construct $ \arr off0 -> if w /= 0
|
||||
then internalWordLoop arr off0 (W# w#)
|
||||
else do
|
||||
writeByteArray arr off0 (c2w '0')
|
||||
|
@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
|
|||
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
|
||||
-- The checks for infinity and NaN have been removed. Note that this is a little
|
||||
-- inaccurate. This is very visible when encoding a number like 2.25, which
|
||||
-- is perfectly represented as a IEEE 754 floating point number but is goofed
|
||||
-- is perfectly represented as an IEEE 754 floating point number but is goofed
|
||||
-- up by this function.
|
||||
-- If you modify this function, please take a took at the resulting core.
|
||||
-- It currently performs no boxing at all, and it would be nice to keep
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
{-# language BangPatterns #-}
|
||||
{-# language ScopedTypeVariables #-}
|
||||
{-# language TypeApplications #-}
|
||||
{-# language OverloadedStrings #-}
|
||||
|
||||
import Control.Monad.ST (runST)
|
||||
import Data.Bytes.Types (MutableBytes(..))
|
||||
|
@ -75,6 +76,14 @@ tests = testGroup "Tests"
|
|||
pack ("999999999") @=? run 1 (doubleDec 999999999)
|
||||
, THU.testCase "doubleDec-K" $
|
||||
pack ("-99999999") @=? run 1 (doubleDec (-99999999))
|
||||
, THU.testCase "shortTextJsonString-A" $
|
||||
pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
|
||||
, THU.testCase "shortTextJsonString-B" $
|
||||
pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
|
||||
, THU.testCase "shortTextJsonString-C" $
|
||||
pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
|
||||
, THU.testCase "shortTextJsonString-D" $
|
||||
pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
|
||||
]
|
||||
, testGroup "alternate"
|
||||
[ TQC.testProperty "HexWord64" $ \x y ->
|
||||
|
|
Loading…
Reference in a new issue