switch unsafe to bounded + char encoding

This commit is contained in:
Alice McKean 2019-09-01 22:43:54 -07:00
parent 22dfde2936
commit 9a14600a38
3 changed files with 152 additions and 29 deletions

View file

@ -37,7 +37,7 @@ flag checked
library library
exposed-modules: exposed-modules:
Data.ByteArray.Builder.Small Data.ByteArray.Builder.Small
Data.ByteArray.Builder.Small.Unsafe Data.ByteArray.Builder.Small.Bounded
build-depends: build-depends:
, base >=4.12.0.0 && <5 , base >=4.12.0.0 && <5
, byteslice >=0.1 && <0.2 , byteslice >=0.1 && <0.2

View file

@ -7,10 +7,10 @@
{-# language UnboxedTuples #-} {-# language UnboxedTuples #-}
module Data.ByteArray.Builder.Small module Data.ByteArray.Builder.Small
( -- * Unsafe Primitives ( -- * Bounded Primitives
Builder(..) Builder(..)
, construct , construct
, fromUnsafe , fromBounded
-- * Evaluation -- * Evaluation
, run , run
, pasteST , pasteST
@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small
, word32PaddedUpperHex , word32PaddedUpperHex
, word16PaddedUpperHex , word16PaddedUpperHex
, word8PaddedUpperHex , word8PaddedUpperHex
, ascii
, char
-- ** Machine-Readable -- ** Machine-Readable
, word64BE , word64BE
, word32BE , word32BE
, word16BE , word16BE
, word8
-- * Encode Floating-Point Types -- * Encode Floating-Point Types
-- ** Human-Readable -- ** Human-Readable
, doubleDec , doubleDec
@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts
import qualified Data.Text.Short as TS import qualified Data.Text.Short as TS
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
import qualified Data.Vector as V import qualified Data.Vector as V
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe import qualified Data.ByteArray.Builder.Small.Bounded as Bounded
-- | An unmaterialized sequence of bytes that may be pasted -- | An unmaterialized sequence of bytes that may be pasted
-- into a mutable byte array. -- into a mutable byte array.
@ -183,9 +186,9 @@ construct f = Builder
Nothing -> (# s1, (-1#) #) Nothing -> (# s1, (-1#) #)
Just (I# n) -> (# s1, n #) Just (I# n) -> (# s1, n #)
fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder
{-# inline fromUnsafe #-} {-# inline fromBounded #-}
fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 -> fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 ->
case fromIntegral (natVal' (proxy# :: Proxy# n)) of case fromIntegral (natVal' (proxy# :: Proxy# n)) of
I# req -> case len >=# req of I# req -> case len >=# req of
1# -> f arr off s0 1# -> f arr off s0
@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
else do else do
write2 dst doff '\\' 'u' write2 dst doff '\\' 'u'
doff' <- Unsafe.pasteST doff' <- Bounded.pasteST
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c))) (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
dst (doff + 2) dst (doff + 2)
go (soff + 1) (slen - 1) doff' go (soff + 1) (slen - 1) doff'
else pure doff else pure doff
@ -262,61 +265,70 @@ shortTextJsonString a =
-- This encoding never starts with a zero unless the -- This encoding never starts with a zero unless the
-- argument was zero. -- argument was zero.
word64Dec :: Word64 -> Builder word64Dec :: Word64 -> Builder
word64Dec w = fromUnsafe (Unsafe.word64Dec w) word64Dec w = fromBounded (Bounded.word64Dec w)
-- | Encodes an unsigned 16-bit integer as decimal. -- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the -- This encoding never starts with a zero unless the
-- argument was zero. -- argument was zero.
word32Dec :: Word32 -> Builder word32Dec :: Word32 -> Builder
word32Dec w = fromUnsafe (Unsafe.word32Dec w) word32Dec w = fromBounded (Bounded.word32Dec w)
-- | Encodes an unsigned 16-bit integer as decimal. -- | Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the -- This encoding never starts with a zero unless the
-- argument was zero. -- argument was zero.
word16Dec :: Word16 -> Builder word16Dec :: Word16 -> Builder
word16Dec w = fromUnsafe (Unsafe.word16Dec w) word16Dec w = fromBounded (Bounded.word16Dec w)
-- | Encode a double-floating-point number, using decimal notation or -- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined -- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
-- crash, but the generated numbers will be nonsense. -- crash, but the generated numbers will be nonsense.
doubleDec :: Double -> Builder doubleDec :: Double -> Builder
doubleDec w = fromUnsafe (Unsafe.doubleDec w) doubleDec w = fromBounded (Bounded.doubleDec w)
-- | Encodes a signed 64-bit integer as decimal. -- | Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero. -- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers -- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything. -- are not preceded by anything.
int64Dec :: Int64 -> Builder int64Dec :: Int64 -> Builder
int64Dec w = fromUnsafe (Unsafe.int64Dec w) int64Dec w = fromBounded (Bounded.int64Dec w)
-- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 16 digits. This uses uppercase for the alphabetical -- the encoding to 16 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @00000000000003FE@. -- digits. For example, this encodes the number 1022 as @00000000000003FE@.
word64PaddedUpperHex :: Word64 -> Builder word64PaddedUpperHex :: Word64 -> Builder
word64PaddedUpperHex w = word64PaddedUpperHex w =
fromUnsafe (Unsafe.word64PaddedUpperHex w) fromBounded (Bounded.word64PaddedUpperHex w)
-- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 8 digits. This uses uppercase for the alphabetical -- the encoding to 8 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @000003FE@. -- digits. For example, this encodes the number 1022 as @000003FE@.
word32PaddedUpperHex :: Word32 -> Builder word32PaddedUpperHex :: Word32 -> Builder
word32PaddedUpperHex w = word32PaddedUpperHex w =
fromUnsafe (Unsafe.word32PaddedUpperHex w) fromBounded (Bounded.word32PaddedUpperHex w)
-- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 4 digits. This uses uppercase for the alphabetical -- the encoding to 4 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 1022 as @03FE@. -- digits. For example, this encodes the number 1022 as @03FE@.
word16PaddedUpperHex :: Word16 -> Builder word16PaddedUpperHex :: Word16 -> Builder
word16PaddedUpperHex w = word16PaddedUpperHex w =
fromUnsafe (Unsafe.word16PaddedUpperHex w) fromBounded (Bounded.word16PaddedUpperHex w)
-- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
-- the encoding to 2 digits. This uses uppercase for the alphabetical -- the encoding to 2 digits. This uses uppercase for the alphabetical
-- digits. For example, this encodes the number 11 as @0B@. -- digits. For example, this encodes the number 11 as @0B@.
word8PaddedUpperHex :: Word8 -> Builder word8PaddedUpperHex :: Word8 -> Builder
word8PaddedUpperHex w = word8PaddedUpperHex w =
fromUnsafe (Unsafe.word8PaddedUpperHex w) fromBounded (Bounded.word8PaddedUpperHex w)
-- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder
ascii c = fromBounded (Bounded.char c)
-- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder
char c = fromBounded (Bounded.char c)
unST :: ST s a -> State# s -> (# State# s, a #) unST :: ST s a -> State# s -> (# State# s, a #)
unST (ST f) = f unST (ST f) = f
@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion. -- word in a big-endian fashion.
word64BE :: Word64 -> Builder word64BE :: Word64 -> Builder
word64BE w = fromUnsafe (Unsafe.word64BE w) word64BE w = fromBounded (Bounded.word64BE w)
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a big-endian fashion. -- word in a big-endian fashion.
word32BE :: Word32 -> Builder word32BE :: Word32 -> Builder
word32BE w = fromUnsafe (Unsafe.word32BE w) word32BE w = fromBounded (Bounded.word32BE w)
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a big-endian fashion. -- word in a big-endian fashion.
word16BE :: Word16 -> Builder word16BE :: Word16 -> Builder
word16BE w = fromUnsafe (Unsafe.word16BE w) word16BE w = fromBounded (Bounded.word16BE w)
word8 :: Word8 -> Builder
word8 w = fromBounded (Bounded.word8 w)
-- ShortText is already UTF-8 encoded. This is a no-op. -- ShortText is already UTF-8 encoded. This is a no-op.
shortTextToByteArray :: ShortText -> ByteArray shortTextToByteArray :: ShortText -> ByteArray

View file

@ -1,8 +1,9 @@
{-# language GADTSyntax #-} {-# language GADTs #-}
{-# language KindSignatures #-} {-# language KindSignatures #-}
{-# language ScopedTypeVariables #-} {-# language ScopedTypeVariables #-}
{-# language BangPatterns #-} {-# language BangPatterns #-}
{-# language MagicHash #-} {-# language MagicHash #-}
{-# language BinaryLiterals #-}
{-# language UnboxedTuples #-} {-# language UnboxedTuples #-}
{-# language RankNTypes #-} {-# language RankNTypes #-}
{-# language LambdaCase #-} {-# language LambdaCase #-}
@ -10,9 +11,8 @@
{-# language DataKinds #-} {-# language DataKinds #-}
{-# language TypeApplications #-} {-# language TypeApplications #-}
-- | The functions in this module do not check to -- | The functions in this module are explict in the amount of bytes they require.
-- see if there is enough space in the buffer. module Data.ByteArray.Builder.Small.Bounded
module Data.ByteArray.Builder.Small.Unsafe
( -- * Builder ( -- * Builder
Builder(..) Builder(..)
, construct , construct
@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe
, pasteIO , pasteIO
-- * Combine -- * Combine
, append , append
-- * Bounds Manipulation
, (<=)
, lessThanEqual
, isLessThanEqual
, raise
-- * Encode Integral Types -- * Encode Integral Types
-- ** Human-Readable -- ** Human-Readable
, word64Dec , word64Dec
@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe
, word32PaddedUpperHex , word32PaddedUpperHex
, word16PaddedUpperHex , word16PaddedUpperHex
, word8PaddedUpperHex , word8PaddedUpperHex
, ascii
, char
-- ** Machine-Readable -- ** Machine-Readable
, word64BE , word64BE
, word32BE , word32BE
@ -53,8 +60,9 @@ import GHC.Word
import GHC.Int import GHC.Int
import Data.Kind import Data.Kind
import GHC.TypeLits (KnownNat,Nat,type (+),natVal') import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
import qualified GHC.TypeLits as GHC
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Control.Monad (when) import qualified Control.Category as Cat
import qualified Data.Primitive as PM import qualified Data.Primitive as PM
@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where
(forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
-> Builder n -> Builder n
knownNat :: KnownNat n => Proxy# n -> Int
knownNat p = fromIntegral (natVal' p)
-- | Execute the builder. This function is safe. -- | Execute the builder. This function is safe.
run :: forall n. KnownNat n run :: forall n. KnownNat n
=> Builder n -- ^ Builder => Builder n -- ^ Builder
-> ByteArray -> ByteArray
{-# inline run #-} {-# inline run #-}
run b = runST $ do run b = runST $ do
arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n))) arr <- newByteArray (knownNat (proxy# :: Proxy# n))
len <- pasteST b arr 0 len <- pasteST b arr 0
shrinkMutableByteArray arr len shrinkMutableByteArray arr len
unsafeFreezeByteArray arr unsafeFreezeByteArray arr
@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n
{-# inline pasteGrowST #-} {-# inline pasteGrowST #-}
pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
sz0 <- PM.getSizeofMutableByteArray arr0 sz0 <- PM.getSizeofMutableByteArray arr0
let req = fromIntegral (natVal' (proxy# :: Proxy# n)) let req = knownNat (proxy# :: Proxy# n)
let sz1 = off0 + req let sz1 = off0 + req
if sz1 <= sz0 if sz1 <= sz0
then do then do
@ -131,6 +142,33 @@ append (Builder f) (Builder g) =
Builder $ \arr off0 s0 -> case f arr off0 s0 of Builder $ \arr off0 s0 -> case f arr off0 s0 of
(# s1, r #) -> g arr r s1 (# s1, r #) -> g arr r s1
-- | A proof that n is less than or equal to m
newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n
instance Cat.Category (<=) where
id = LessThanEqual 0
-- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a)
LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba)
-- | Dynamically check than 'n' is less than or equal to 'm'
isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m)
isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing
where diff = knownNat m - knownNat n
-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time.
lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m
lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n))
-- | Weaken the bound on the maximum number of bytes required.
-- >>> :{
-- buildNumber :: Either Double Word64 -> Builder 32
-- buildNumber = \case
-- Left d -> doubleDec d
-- Right w -> raise lessThanEqual (word64Dec w)
-- :}
raise :: forall m n. n <= m -> Builder n -> Builder m
raise !_ (Builder f) = Builder f
-- | Encode a double-floating-point number, using decimal notation or -- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined -- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do
where where
w = W# w# w = W# w#
-- | Encode an ASCII char.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder 1
ascii c = word8 (fromIntegral @Int @Word8 (ord c))
-- | Encode an UTF8 char. This only uses as much space as is required.
char :: Char -> Builder 4
char c
| codepoint < 0x80 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 codepoint)
pure (off + 1)
| codepoint < 0x800 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
return (off + 2)
| codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do
-- Codepoint U+FFFD
writeByteArray arr off (0xEF :: Word8)
writeByteArray arr (off + 1) (0xBF :: Word8)
writeByteArray arr (off + 2) (0xBD :: Word8)
return (off + 3)
| codepoint < 0x10000 = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
return (off + 3)
| otherwise = construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
return (off + 4)
where
codepoint :: Word
codepoint = fromIntegral (ord c)
unsafeWordToWord8 :: Word -> Word8
unsafeWordToWord8 (W# w) = W8# w
-- precondition: codepoint is less than 0x800
byteTwoOne :: Word -> Word
byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
byteTwoTwo :: Word -> Word
byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x1000
byteThreeOne :: Word -> Word
byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
byteThreeTwo :: Word -> Word
byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteThreeThree :: Word -> Word
byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x110000
byteFourOne :: Word -> Word
byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
byteFourTwo :: Word -> Word
byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
byteFourThree :: Word -> Word
byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteFourFour :: Word -> Word
byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion. -- word in a big-endian fashion.
word64BE :: Word64 -> Builder 8 word64BE :: Word64 -> Builder 8