diff --git a/CHANGELOG.md b/CHANGELOG.md index 611ed1e..6520396 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * Add a `pasteGrowST` for length-indexed builders. * Add function for rendering floating-point numbers in a slightly inaccurate way. +* Add `word16Dec`. +* Add functions for encoding `ShortText` as UTF-8 and as a JSON string. ## 0.1.1.0 -- 2019-07-30 diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index c0a17c5..8fe031a 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -44,6 +44,8 @@ library , primitive-offset >=0.2 && <0.3 , run-st >=0.1 && <0.2 , vector >=0.12.0.3 && <0.13 + , bytestring >=0.10.8.2 && <0.11 + , text-short >=0.1.3 && <0.2 if flag(checked) build-depends: primitive-checked >= 0.7 && <0.8 else diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder/Small.hs index 152e60e..70e0bbb 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder/Small.hs @@ -22,9 +22,12 @@ module Data.ByteArray.Builder.Small -- * Materialized Byte Sequences , bytes , bytearray + , shortTextUtf8 + , shortTextJsonString -- * Encode Integral Types -- ** Human-Readable , word64Dec + , word16Dec , int64Dec , word64PaddedUpperHex , word32PaddedUpperHex @@ -50,7 +53,12 @@ import GHC.ST import GHC.Word import GHC.TypeLits (KnownNat,natVal') import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) +import Data.ByteString.Short.Internal (ShortByteString(SBS)) +import Data.Text.Short (ShortText) +import Data.Char (ord) +import qualified GHC.Exts as Exts +import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe @@ -191,12 +199,73 @@ bytes (Bytes src soff slen) = construct $ \(MutableBytes arr off len) -> if len pure (Just (off + slen)) else pure Nothing +-- Internal function. Precondition, the referenced slice of the +-- byte sequence is UTF-8 encoded text. +slicedUtf8TextJson :: ByteArray# -> Int# -> Int# -> Builder +{-# inline slicedUtf8TextJson #-} +slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 dlen0) -> + let slen0 = I# slen0# + in if dlen0 > (2 * slen0) + 2 + then do + PM.writeByteArray dst doff0 (c2w '"') + let go !soff !slen !doff = if slen > 0 + then case indexChar8Array (ByteArray src#) soff of + '\\' -> write2 dst doff '\\' '\\' *> go (soff + 1) (slen - 1) (doff + 2) + '\"' -> write2 dst doff '\\' '\"' *> go (soff + 1) (slen - 1) (doff + 2) + '\n' -> write2 dst doff '\\' 'n' *> go (soff + 1) (slen - 1) (doff + 2) + '\r' -> write2 dst doff '\\' 'r' *> go (soff + 1) (slen - 1) (doff + 2) + '\t' -> write2 dst doff '\\' 't' *> go (soff + 1) (slen - 1) (doff + 2) + c -> if c >= '\x20' + then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) + else do + write2 dst doff '\\' 'u' + doff' <- Unsafe.pasteST + (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c))) + dst (doff + 2) + go (soff + 1) (slen - 1) doff' + else pure doff + doffRes <- go (I# soff0#) (I# slen0#) (doff0 + 1) + PM.writeByteArray dst doffRes (c2w '"') + pure (Just (doffRes + 1)) + else pure Nothing + +-- Internal. Write two characters in the ASCII plane to a byte array. +write2 :: MutableByteArray s -> Int -> Char -> Char -> ST s () +write2 marr ix a b = do + PM.writeByteArray marr ix (c2w a) + PM.writeByteArray marr (ix + 1) (c2w b) + +-- | Create a builder from text. The text will be UTF-8 encoded. +shortTextUtf8 :: ShortText -> Builder +shortTextUtf8 a = + let ba = shortTextToByteArray a + in bytes (Bytes ba 0 (sizeofByteArray ba)) + +-- | Create a builder from text. The text will be UTF-8 encoded, +-- and JSON special characters will be escaped. Additionally, the +-- result is surrounded by double quotes. For example: +-- +-- * @foo ==> "foo"@ +-- * @\_"_/ ==> "\\_\"_/"@ +-- * @helloworld ==> "hello\u001Bworld"@ (where is code point 0x1B) +shortTextJsonString :: ShortText -> Builder +shortTextJsonString a = + let !(ByteArray ba) = shortTextToByteArray a + !(I# len) = PM.sizeofByteArray (ByteArray ba) + in slicedUtf8TextJson ba 0# len + -- | Encodes an unsigned 64-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word64Dec :: Word64 -> Builder word64Dec w = fromUnsafe (Unsafe.word64Dec w) +-- | Encodes an unsigned 16-bit integer as decimal. +-- This encoding never starts with a zero unless the +-- argument was zero. +word16Dec :: Word16 -> Builder +word16Dec w = fromUnsafe (Unsafe.word16Dec w) + -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not @@ -260,3 +329,14 @@ word32BE w = fromUnsafe (Unsafe.word32BE w) -- word in a big-endian fashion. word16BE :: Word16 -> Builder word16BE w = fromUnsafe (Unsafe.word16BE w) + +-- ShortText is already UTF-8 encoded. This is a no-op. +shortTextToByteArray :: ShortText -> ByteArray +shortTextToByteArray x = case TS.toShortByteString x of + SBS a -> ByteArray a + +indexChar8Array :: ByteArray -> Int -> Char +indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) + +c2w :: Char -> Word8 +c2w = fromIntegral . ord diff --git a/src/Data/ByteArray/Builder/Small/Unsafe.hs b/src/Data/ByteArray/Builder/Small/Unsafe.hs index 7f7f4c9..84df3a9 100644 --- a/src/Data/ByteArray/Builder/Small/Unsafe.hs +++ b/src/Data/ByteArray/Builder/Small/Unsafe.hs @@ -26,6 +26,7 @@ module Data.ByteArray.Builder.Small.Unsafe -- * Encode Integral Types -- ** Human-Readable , word64Dec + , word16Dec , int64Dec , word64PaddedUpperHex , word32PaddedUpperHex @@ -139,7 +140,12 @@ doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0) -- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. word64Dec :: Word64 -> Builder 19 -word64Dec (W64# w) = word64Dec# w +word64Dec (W64# w) = wordCommonDec# w + +-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal. +-- This encoding never starts with a zero unless the argument was zero. +word16Dec :: Word16 -> Builder 5 +word16Dec (W16# w) = wordCommonDec# w -- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. @@ -148,10 +154,11 @@ word64Dec (W64# w) = word64Dec# w int64Dec :: Int64 -> Builder 20 int64Dec (I64# w) = int64Dec# w --- | Requires up to 19 bytes. -word64Dec# :: Word# -> Builder 19 -{-# noinline word64Dec# #-} -word64Dec# w# = construct $ \arr off0 -> if w /= 0 +-- Requires a number of bytes that is bounded by the size of +-- the word. This is only used internally. +wordCommonDec# :: Word# -> Builder n +{-# noinline wordCommonDec# #-} +wordCommonDec# w# = construct $ \arr off0 -> if w /= 0 then internalWordLoop arr off0 (W# w#) else do writeByteArray arr off0 (c2w '0') @@ -353,7 +360,7 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) = -- This is adapted from androider's code in https://stackoverflow.com/a/7097567 -- The checks for infinity and NaN have been removed. Note that this is a little -- inaccurate. This is very visible when encoding a number like 2.25, which --- is perfectly represented as a IEEE 754 floating point number but is goofed +-- is perfectly represented as an IEEE 754 floating point number but is goofed -- up by this function. -- If you modify this function, please take a took at the resulting core. -- It currently performs no boxing at all, and it would be nice to keep diff --git a/test/Main.hs b/test/Main.hs index 1169a97..649711e 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -1,6 +1,7 @@ {-# language BangPatterns #-} {-# language ScopedTypeVariables #-} {-# language TypeApplications #-} +{-# language OverloadedStrings #-} import Control.Monad.ST (runST) import Data.Bytes.Types (MutableBytes(..)) @@ -75,6 +76,14 @@ tests = testGroup "Tests" pack ("999999999") @=? run 1 (doubleDec 999999999) , THU.testCase "doubleDec-K" $ pack ("-99999999") @=? run 1 (doubleDec (-99999999)) + , THU.testCase "shortTextJsonString-A" $ + pack ("\"hello\"") @=? run 1 (shortTextJsonString "hello") + , THU.testCase "shortTextJsonString-B" $ + pack ("\"\\\\_\\\"_/\"") @=? run 1 (shortTextJsonString "\\_\"_/") + , THU.testCase "shortTextJsonString-C" $ + pack ("\"Hi\\r\\nLo\"") @=? run 1 (shortTextJsonString "Hi\r\nLo") + , THU.testCase "shortTextJsonString-D" $ + pack ("\"Hi\\u001BLo\"") @=? run 1 (shortTextJsonString "Hi\ESCLo") ] , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y ->