Add functions for encoding ShortText as UTF-8 and as JSON
This commit is contained in:
parent
1fd4b3ab1e
commit
5d3d6d1afd
5 changed files with 106 additions and 6 deletions
|
@ -7,6 +7,8 @@
|
||||||
* Add a `pasteGrowST` for length-indexed builders.
|
* Add a `pasteGrowST` for length-indexed builders.
|
||||||
* Add function for rendering floating-point numbers in a slightly
|
* Add function for rendering floating-point numbers in a slightly
|
||||||
inaccurate way.
|
inaccurate way.
|
||||||
|
* Add `word16Dec`.
|
||||||
|
* Add functions for encoding `ShortText` as UTF-8 and as a JSON string.
|
||||||
|
|
||||||
## 0.1.1.0 -- 2019-07-30
|
## 0.1.1.0 -- 2019-07-30
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,8 @@ library
|
||||||
, primitive-offset >=0.2 && <0.3
|
, primitive-offset >=0.2 && <0.3
|
||||||
, run-st >=0.1 && <0.2
|
, run-st >=0.1 && <0.2
|
||||||
, vector >=0.12.0.3 && <0.13
|
, vector >=0.12.0.3 && <0.13
|
||||||
|
, bytestring >=0.10.8.2 && <0.11
|
||||||
|
, text-short >=0.1.3 && <0.2
|
||||||
if flag(checked)
|
if flag(checked)
|
||||||
build-depends: primitive-checked >= 0.7 && <0.8
|
build-depends: primitive-checked >= 0.7 && <0.8
|
||||||
else
|
else
|
||||||
|
|
|
@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small
|
||||||
-- * Materialized Byte Sequences
|
-- * Materialized Byte Sequences
|
||||||
, bytes
|
, bytes
|
||||||
, bytearray
|
, bytearray
|
||||||
|
, shortTextUtf8
|
||||||
|
, shortTextJsonString
|
||||||
-- * Encode Integral Types
|
-- * Encode Integral Types
|
||||||
-- ** Human-Readable
|
-- ** Human-Readable
|
||||||
, word64Dec
|
, word64Dec
|
||||||
|
, word16Dec
|
||||||
, int64Dec
|
, int64Dec
|
||||||
, word64PaddedUpperHex
|
, word64PaddedUpperHex
|
||||||
, word32PaddedUpperHex
|
, word32PaddedUpperHex
|
||||||
|
@ -50,7 +53,12 @@ import GHC.ST
|
||||||
import GHC.Word
|
import GHC.Word
|
||||||
import GHC.TypeLits (KnownNat,natVal')
|
import GHC.TypeLits (KnownNat,natVal')
|
||||||
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
|
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
|
||||||
|
import Data.ByteString.Short.Internal (ShortByteString(SBS))
|
||||||
|
import Data.Text.Short (ShortText)
|
||||||
|
import Data.Char (ord)
|
||||||
|
|
||||||
|
import qualified GHC.Exts as Exts
|
||||||
|
import qualified Data.Text.Short as TS
|
||||||
import qualified Data.Primitive as PM
|
import qualified Data.Primitive as PM
|
||||||
import qualified Data.Vector as V
|
import qualified Data.Vector as V
|
||||||
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
|
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
|
||||||
|
@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len
|
||||||
pure (Just (off + slen))
|
pure (Just (off + slen))
|
||||||
else pure Nothing
|
else pure Nothing
|
||||||
|
|
||||||
|
-- Internal function. Precondition, the referenced slice of the
|
||||||
|
-- byte sequence is UTF-8 encoded text.
|
||||||
|
slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder
|
||||||
|
{-# inline slicedUtf8TextJson #-}
|
||||||
|
slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) ->
|
||||||
|
let slen0 = I# slen0#
|
||||||
|
in if dlen0 > (2 * slen0) + 2
|
||||||
|
then do
|
||||||
|
PM.writeByteArray dst doff0 (c2w '"')
|
||||||
|
let go !soff !slen !doff = if slen > 0
|
||||||
|
then case indexChar8Array (ByteArray src#) soff of
|
||||||
|
'\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||||
|
'\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||||
|
'\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||||
|
'\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||||
|
'\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2)
|
||||||
|
c -> if c >= '\x20'
|
||||||
|
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
|
||||||
|
else do
|
||||||
|
write2 dst doff '\\' 'u'
|
||||||
|
doff' <- Unsafe.pasteST
|
||||||
|
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||||
|
dst (doff + 2)
|
||||||
|
go (soff + 1) (slen - 1) doff'
|
||||||
|
else pure doff
|
||||||
|
doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1)
|
||||||
|
PM.writeByteArray dst doffRes (c2w '"')
|
||||||
|
pure (Just (doffRes + 1))
|
||||||
|
else pure Nothing
|
||||||
|
|
||||||
|
-- Internal. Write two characters in the ASCII plane to a byte array.
|
||||||
|
write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s ()
|
||||||
|
write2 marr ix a b = do
|
||||||
|
PM.writeByteArray marr ix (c2w a)
|
||||||
|
PM.writeByteArray marr (ix + 1) (c2w b)
|
||||||
|
|
||||||
|
-- | Create a builder from text. The text will be UTF-8 encoded.
|
||||||
|
shortTextUtf8 :: ShortText -> Builder
|
||||||
|
shortTextUtf8 a =
|
||||||
|
let ba = shortTextToByteArray a
|
||||||
|
in bytes (Bytes ba 0 (sizeofByteArray ba))
|
||||||
|
|
||||||
|
-- | Create a builder from text. The text will be UTF-8 encoded,
|
||||||
|
-- and JSON special characters will be escaped. Additionally, the
|
||||||
|
-- result is surrounded by double quotes. For example:
|
||||||
|
--
|
||||||
|
-- * @foo ==> "foo"@
|
||||||
|
-- * @\_"_/ ==> "\\_\"_/"@
|
||||||
|
-- * @hello<ESC>world ==> "hello\u001Bworld"@ (where <LF> is code point 0x1B)
|
||||||
|
shortTextJsonString :: ShortText -> Builder
|
||||||
|
shortTextJsonString a =
|
||||||
|
let !(ByteArray ba) = shortTextToByteArray a
|
||||||
|
!(I# len) = PM.sizeofByteArray (ByteArray ba)
|
||||||
|
in slicedUtf8TextJson ba 0# len
|
||||||
|
|
||||||
-- | Encodes an unsigned 64-bit integer as decimal.
|
-- | Encodes an unsigned 64-bit integer as decimal.
|
||||||
-- This encoding never starts with a zero unless the
|
-- This encoding never starts with a zero unless the
|
||||||
-- argument was zero.
|
-- argument was zero.
|
||||||
word64Dec :: Word64 -> Builder
|
word64Dec :: Word64 -> Builder
|
||||||
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
|
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
|
||||||
|
|
||||||
|
-- | Encodes an unsigned 16-bit integer as decimal.
|
||||||
|
-- This encoding never starts with a zero unless the
|
||||||
|
-- argument was zero.
|
||||||
|
word16Dec :: Word16 -> Builder
|
||||||
|
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
|
||||||
|
|
||||||
-- | Encode a double-floating-point number, using decimal notation or
|
-- | Encode a double-floating-point number, using decimal notation or
|
||||||
-- scientific notation depending on the magnitude. This has undefined
|
-- scientific notation depending on the magnitude. This has undefined
|
||||||
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
|
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
|
||||||
|
@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w)
|
||||||
-- word in a big-endian fashion.
|
-- word in a big-endian fashion.
|
||||||
word16BE :: Word16 -> Builder
|
word16BE :: Word16 -> Builder
|
||||||
word16BE w = fromUnsafe (Unsafe.word16BE w)
|
word16BE w = fromUnsafe (Unsafe.word16BE w)
|
||||||
|
|
||||||
|
-- ShortText is already UTF-8 encoded. This is a no-op.
|
||||||
|
shortTextToByteArray :: ShortText -> ByteArray
|
||||||
|
shortTextToByteArray x = case TS.toShortByteString x of
|
||||||
|
SBS a -> ByteArray a
|
||||||
|
|
||||||
|
indexChar8Array :: ByteArray -> Int -> Char
|
||||||
|
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
|
||||||
|
|
||||||
|
c2w :: Char -> Word8
|
||||||
|
c2w = fromIntegral . ord
|
||||||
|
|
|
@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe
|
||||||
-- * Encode Integral Types
|
-- * Encode Integral Types
|
||||||
-- ** Human-Readable
|
-- ** Human-Readable
|
||||||
, word64Dec
|
, word64Dec
|
||||||
|
, word16Dec
|
||||||
, int64Dec
|
, int64Dec
|
||||||
, word64PaddedUpperHex
|
, word64PaddedUpperHex
|
||||||
, word32PaddedUpperHex
|
, word32PaddedUpperHex
|
||||||
|
@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
|
||||||
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
|
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
|
||||||
-- This encoding never starts with a zero unless the argument was zero.
|
-- This encoding never starts with a zero unless the argument was zero.
|
||||||
word64Dec :: Word64 -> Builder 19
|
word64Dec :: Word64 -> Builder 19
|
||||||
word64Dec (W64# w) = word64Dec# w
|
word64Dec (W64# w) = wordCommonDec# w
|
||||||
|
|
||||||
|
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
|
||||||
|
-- This encoding never starts with a zero unless the argument was zero.
|
||||||
|
word16Dec :: Word16 -> Builder 5
|
||||||
|
word16Dec (W16# w) = wordCommonDec# w
|
||||||
|
|
||||||
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
|
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
|
||||||
-- This encoding never starts with a zero unless the argument was zero.
|
-- This encoding never starts with a zero unless the argument was zero.
|
||||||
|
@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w
|
||||||
int64Dec :: Int64 -> Builder 20
|
int64Dec :: Int64 -> Builder 20
|
||||||
int64Dec (I64# w) = int64Dec# w
|
int64Dec (I64# w) = int64Dec# w
|
||||||
|
|
||||||
-- | Requires up to 19 bytes.
|
-- Requires a number of bytes that is bounded by the size of
|
||||||
word64Dec# :: Word# -> Builder 19
|
-- the word. This is only used internally.
|
||||||
{-# noinline word64Dec# #-}
|
wordCommonDec# :: Word# -> Builder n
|
||||||
word64Dec# w# = construct $ \arr off0 -> if w /= 0
|
{-# noinline wordCommonDec# #-}
|
||||||
|
wordCommonDec# w# = construct $ \arr off0 -> if w /= 0
|
||||||
then internalWordLoop arr off0 (W# w#)
|
then internalWordLoop arr off0 (W# w#)
|
||||||
else do
|
else do
|
||||||
writeByteArray arr off0 (c2w '0')
|
writeByteArray arr off0 (c2w '0')
|
||||||
|
@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
|
||||||
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
|
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
|
||||||
-- The checks for infinity and NaN have been removed. Note that this is a little
|
-- The checks for infinity and NaN have been removed. Note that this is a little
|
||||||
-- inaccurate. This is very visible when encoding a number like 2.25, which
|
-- inaccurate. This is very visible when encoding a number like 2.25, which
|
||||||
-- is perfectly represented as a IEEE 754 floating point number but is goofed
|
-- is perfectly represented as an IEEE 754 floating point number but is goofed
|
||||||
-- up by this function.
|
-- up by this function.
|
||||||
-- If you modify this function, please take a took at the resulting core.
|
-- If you modify this function, please take a took at the resulting core.
|
||||||
-- It currently performs no boxing at all, and it would be nice to keep
|
-- It currently performs no boxing at all, and it would be nice to keep
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
{-# language BangPatterns #-}
|
{-# language BangPatterns #-}
|
||||||
{-# language ScopedTypeVariables #-}
|
{-# language ScopedTypeVariables #-}
|
||||||
{-# language TypeApplications #-}
|
{-# language TypeApplications #-}
|
||||||
|
{-# language OverloadedStrings #-}
|
||||||
|
|
||||||
import Control.Monad.ST (runST)
|
import Control.Monad.ST (runST)
|
||||||
import Data.Bytes.Types (MutableBytes(..))
|
import Data.Bytes.Types (MutableBytes(..))
|
||||||
|
@ -75,6 +76,14 @@ tests = testGroup "Tests"
|
||||||
pack ("999999999") @=? run 1 (doubleDec 999999999)
|
pack ("999999999") @=? run 1 (doubleDec 999999999)
|
||||||
, THU.testCase "doubleDec-K" $
|
, THU.testCase "doubleDec-K" $
|
||||||
pack ("-99999999") @=? run 1 (doubleDec (-99999999))
|
pack ("-99999999") @=? run 1 (doubleDec (-99999999))
|
||||||
|
, THU.testCase "shortTextJsonString-A" $
|
||||||
|
pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello")
|
||||||
|
, THU.testCase "shortTextJsonString-B" $
|
||||||
|
pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/")
|
||||||
|
, THU.testCase "shortTextJsonString-C" $
|
||||||
|
pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo")
|
||||||
|
, THU.testCase "shortTextJsonString-D" $
|
||||||
|
pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo")
|
||||||
]
|
]
|
||||||
, testGroup "alternate"
|
, testGroup "alternate"
|
||||||
[ TQC.testProperty "HexWord64" $ \x y ->
|
[ TQC.testProperty "HexWord64" $ \x y ->
|
||||||
|
|
Loading…
Reference in a new issue