switch unsafe to bounded + char encoding

This commit is contained in:
Alice McKean 2019-09-01 22:43:54 -07:00
parent 22dfde2936
commit 9a14600a38
3 changed files with 152 additions and 29 deletions

View file

@ -37,7 +37,7 @@ flag checked
library
exposed-modules:
Data.ByteArray.Builder.Small
Data.ByteArray.Builder.Small.Unsafe
Data.ByteArray.Builder.Small.Bounded
build-depends:
, base >=4.12.0.0 && <5
, byteslice >=0.1 && <0.2

View file

@ -7,10 +7,10 @@
{-# language UnboxedTuples #-}
module Data.ByteArray.Builder.Small
( -- * Unsafe Primitives
( -- * Bounded Primitives
Builder(..)
, construct
, fromUnsafe
, fromBounded
-- * Evaluation
, run
, pasteST
@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small
, word32PaddedUpperHex
, word16PaddedUpperHex
, word8PaddedUpperHex
, ascii
, char
-- ** Machine-Readable
, word64BE
, word32BE
, word16BE
, word8
-- * Encode Floating-Point Types
-- ** Human-Readable
, doubleDec
@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS
import qualified Data.Primitive as PM
import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
import qualified Data.ByteArray.Builder.Small.Bounded as Bounded
-- | An unmaterialized sequence of bytes that may be pasted
-- into a mutable byte array.
@ -183,9 +186,9 @@ construct f = Builder
Nothing -> (# s1, (-1#) #)
Just (I# n) -> (# s1, n #)
fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder
{-# inline fromUnsafe #-}
fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 ->
fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder
{-# inline fromBounded #-}
fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 ->
case fromIntegral (natVal' (proxy# :: Proxy# n)) of
I# req -> case len >=# req of
1# -> f arr off s0
@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else do
write2 dst doff '\\' 'u'
doff' <- Unsafe.pasteST
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
doff' <- Bounded.pasteST
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2)
go (soff + 1) (slen - 1) doff'
else pure doff
@ -262,61 +265,70 @@ shortTextJsonString a =
-- This encoding never starts with a zero unless the
-- argument was zero.
word64Dec :: Word64 -> Builder
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
word64Dec w = fromBounded (Bounded.word64Dec w)
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word32Dec :: Word32 -> Builder
word32Dec w = fromUnsafe (Unsafe.word32Dec w)
word32Dec w = fromBounded (Bounded.word32Dec w)
-- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the
-- argument was zero.
word16Dec :: Word16 -> Builder
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
word16Dec w = fromBounded (Bounded.word16Dec w)
-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
-- crash, but the generated numbers will be nonsense.
doubleDec :: Double -> Builder
doubleDec w = fromUnsafe (Unsafe.doubleDec w)
doubleDec w = fromBounded (Bounded.doubleDec w)
-- | Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int64Dec :: Int64 -> Builder
int64Dec w = fromUnsafe (Unsafe.int64Dec w)
int64Dec w = fromBounded (Bounded.int64Dec w)
-- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 16 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @00000000000003FE@.
word64PaddedUpperHex :: Word64 -> Builder
word64PaddedUpperHex w =
fromUnsafe (Unsafe.word64PaddedUpperHex w)
fromBounded (Bounded.word64PaddedUpperHex w)
-- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 8 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @000003FE@.
word32PaddedUpperHex :: Word32 -> Builder
word32PaddedUpperHex w =
fromUnsafe (Unsafe.word32PaddedUpperHex w)
fromBounded (Bounded.word32PaddedUpperHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 4 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @03FE@.
word16PaddedUpperHex :: Word16 -> Builder
word16PaddedUpperHex w =
fromUnsafe (Unsafe.word16PaddedUpperHex w)
fromBounded (Bounded.word16PaddedUpperHex w)
-- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 2 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 11 as @0B@.
word8PaddedUpperHex :: Word8 -> Builder
word8PaddedUpperHex w =
fromUnsafe (Unsafe.word8PaddedUpperHex w)
fromBounded (Bounded.word8PaddedUpperHex w)
-- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder
ascii c = fromBounded (Bounded.char c)
-- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder
char c = fromBounded (Bounded.char c)
unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f
@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion.
word64BE :: Word64 -> Builder
word64BE w = fromUnsafe (Unsafe.word64BE w)
word64BE w = fromBounded (Bounded.word64BE w)
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a big-endian fashion.
word32BE :: Word32 -> Builder
word32BE w = fromUnsafe (Unsafe.word32BE w)
word32BE w = fromBounded (Bounded.word32BE w)
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a big-endian fashion.
word16BE :: Word16 -> Builder
word16BE w = fromUnsafe (Unsafe.word16BE w)
word16BE w = fromBounded (Bounded.word16BE w)
word8 :: Word8 -> Builder
word8 w = fromBounded (Bounded.word8 w)
-- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray
@ -349,4 +364,4 @@ indexChar8Array :: ByteArray -> Int -> Char
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord
c2w = fromIntegral . ord

View file

@ -1,8 +1,9 @@
{-# language GADTSyntax #-}
{-# language GADTs #-}
{-# language KindSignatures #-}
{-# language ScopedTypeVariables #-}
{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language BinaryLiterals #-}
{-# language UnboxedTuples #-}
{-# language RankNTypes #-}
{-# language LambdaCase #-}
@ -10,9 +11,8 @@
{-# language DataKinds #-}
{-# language TypeApplications #-}
-- | The functions in this module do not check to
-- see if there is enough space in the buffer.
module Data.ByteArray.Builder.Small.Unsafe
-- | The functions in this module are explict in the amount of bytes they require.
module Data.ByteArray.Builder.Small.Bounded
( -- * Builder
Builder(..)
, construct
@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe
, pasteIO
-- * Combine
, append
-- * Bounds Manipulation
, (<=)
, lessThanEqual
, isLessThanEqual
, raise
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe
, word32PaddedUpperHex
, word16PaddedUpperHex
, word8PaddedUpperHex
, ascii
, char
-- ** Machine-Readable
, word64BE
, word32BE
@ -53,8 +60,9 @@ import GHC.Word
import GHC.Int
import Data.Kind
import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
import qualified GHC.TypeLits as GHC
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Control.Monad (when)
import qualified Control.Category as Cat
import qualified Data.Primitive as PM
@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where
(forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
-> Builder n
knownNat :: KnownNat n => Proxy# n -> Int
knownNat p = fromIntegral (natVal' p)
-- | Execute the builder. This function is safe.
run :: forall n. KnownNat n
=> Builder n -- ^ Builder
-> ByteArray
{-# inline run #-}
run b = runST $ do
arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n)))
arr <- newByteArray (knownNat (proxy# :: Proxy# n))
len <- pasteST b arr 0
shrinkMutableByteArray arr len
unsafeFreezeByteArray arr
@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n
{-# inline pasteGrowST #-}
pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
sz0 <- PM.getSizeofMutableByteArray arr0
let req = fromIntegral (natVal' (proxy# :: Proxy# n))
let req = knownNat (proxy# :: Proxy# n)
let sz1 = off0 + req
if sz1 <= sz0
then do
@ -131,6 +142,33 @@ append (Builder f) (Builder g) =
Builder $ \arr off0 s0 -> case f arr off0 s0 of
(# s1, r #) -> g arr r s1
-- | A proof that n is less than or equal to m
newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n
instance Cat.Category (<=) where
id = LessThanEqual 0
-- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a)
LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba)
-- | Dynamically check than 'n' is less than or equal to 'm'
isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m)
isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing
where diff = knownNat m - knownNat n
-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time.
lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m
lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n))
-- | Weaken the bound on the maximum number of bytes required.
-- >>> :{
-- buildNumber :: Either Double Word64 -> Builder 32
-- buildNumber = \case
-- Left d -> doubleDec d
-- Right w -> raise lessThanEqual (word64Dec w)
-- :}
raise :: forall m n. n <= m -> Builder n -> Builder m
raise !_ (Builder f) = Builder f
-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do
where
w = W# w#
-- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder 1
ascii c = word8 (fromIntegral @Int @Word8 (ord c))
-- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder 4
char c
| codepoint < 0x80 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 codepoint)
pure (off + 1)
| codepoint < 0x800 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
return (off + 2)
| codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do
-- Codepoint U+FFFD
writeByteArray arr off (0xEF :: Word8)
writeByteArray arr (off + 1) (0xBF :: Word8)
writeByteArray arr (off + 2) (0xBD :: Word8)
return (off + 3)
| codepoint < 0x10000 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
return (off + 3)
| otherwise = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
return (off + 4)
where
codepoint :: Word
codepoint = fromIntegral (ord c)
unsafeWordToWord8 :: Word -> Word8
unsafeWordToWord8 (W# w) = W8# w
-- precondition: codepoint is less than 0x800
byteTwoOne :: Word -> Word
byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
byteTwoTwo :: Word -> Word
byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x1000
byteThreeOne :: Word -> Word
byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
byteThreeTwo :: Word -> Word
byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteThreeThree :: Word -> Word
byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x110000
byteFourOne :: Word -> Word
byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
byteFourTwo :: Word -> Word
byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
byteFourThree :: Word -> Word
byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteFourFour :: Word -> Word
byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion.
word64BE :: Word64 -> Builder 8