switch unsafe to bounded + char encoding
This commit is contained in:
parent
22dfde2936
commit
9a14600a38
3 changed files with 152 additions and 29 deletions
|
@ -37,7 +37,7 @@ flag checked
|
|||
library
|
||||
exposed-modules:
|
||||
Data.ByteArray.Builder.Small
|
||||
Data.ByteArray.Builder.Small.Unsafe
|
||||
Data.ByteArray.Builder.Small.Bounded
|
||||
build-depends:
|
||||
, base >=4.12.0.0 && <5
|
||||
, byteslice >=0.1 && <0.2
|
||||
|
|
|
@ -7,10 +7,10 @@
|
|||
{-# language UnboxedTuples #-}
|
||||
|
||||
module Data.ByteArray.Builder.Small
|
||||
( -- * Unsafe Primitives
|
||||
( -- * Bounded Primitives
|
||||
Builder(..)
|
||||
, construct
|
||||
, fromUnsafe
|
||||
, fromBounded
|
||||
-- * Evaluation
|
||||
, run
|
||||
, pasteST
|
||||
|
@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small
|
|||
, word32PaddedUpperHex
|
||||
, word16PaddedUpperHex
|
||||
, word8PaddedUpperHex
|
||||
, ascii
|
||||
, char
|
||||
-- ** Machine-Readable
|
||||
, word64BE
|
||||
, word32BE
|
||||
, word16BE
|
||||
, word8
|
||||
-- * Encode Floating-Point Types
|
||||
-- ** Human-Readable
|
||||
, doubleDec
|
||||
|
@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts
|
|||
import qualified Data.Text.Short as TS
|
||||
import qualified Data.Primitive as PM
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
|
||||
import qualified Data.ByteArray.Builder.Small.Bounded as Bounded
|
||||
|
||||
-- | An unmaterialized sequence of bytes that may be pasted
|
||||
-- into a mutable byte array.
|
||||
|
@ -183,9 +186,9 @@ construct f = Builder
|
|||
Nothing -> (# s1, (-1#) #)
|
||||
Just (I# n) -> (# s1, n #)
|
||||
|
||||
fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder
|
||||
{-# inline fromUnsafe #-}
|
||||
fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 ->
|
||||
fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder
|
||||
{-# inline fromBounded #-}
|
||||
fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 ->
|
||||
case fromIntegral (natVal' (proxy# :: Proxy# n)) of
|
||||
I# req -> case len >=# req of
|
||||
1# -> f arr off s0
|
||||
|
@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
|
|||
then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
|
||||
else do
|
||||
write2 dst doff '\\' 'u'
|
||||
doff' <- Unsafe.pasteST
|
||||
(Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||
doff' <- Bounded.pasteST
|
||||
(Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
|
||||
dst (doff + 2)
|
||||
go (soff + 1) (slen - 1) doff'
|
||||
else pure doff
|
||||
|
@ -262,61 +265,70 @@ shortTextJsonString a =
|
|||
-- This encoding never starts with a zero unless the
|
||||
-- argument was zero.
|
||||
word64Dec :: Word64 -> Builder
|
||||
word64Dec w = fromUnsafe (Unsafe.word64Dec w)
|
||||
word64Dec w = fromBounded (Bounded.word64Dec w)
|
||||
|
||||
-- | Encodes an unsigned 16-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the
|
||||
-- argument was zero.
|
||||
word32Dec :: Word32 -> Builder
|
||||
word32Dec w = fromUnsafe (Unsafe.word32Dec w)
|
||||
word32Dec w = fromBounded (Bounded.word32Dec w)
|
||||
|
||||
-- | Encodes an unsigned 16-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the
|
||||
-- argument was zero.
|
||||
word16Dec :: Word16 -> Builder
|
||||
word16Dec w = fromUnsafe (Unsafe.word16Dec w)
|
||||
word16Dec w = fromBounded (Bounded.word16Dec w)
|
||||
|
||||
-- | Encode a double-floating-point number, using decimal notation or
|
||||
-- scientific notation depending on the magnitude. This has undefined
|
||||
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
|
||||
-- crash, but the generated numbers will be nonsense.
|
||||
doubleDec :: Double -> Builder
|
||||
doubleDec w = fromUnsafe (Unsafe.doubleDec w)
|
||||
doubleDec w = fromBounded (Bounded.doubleDec w)
|
||||
|
||||
-- | Encodes a signed 64-bit integer as decimal.
|
||||
-- This encoding never starts with a zero unless the argument was zero.
|
||||
-- Negative numbers are preceded by a minus sign. Positive numbers
|
||||
-- are not preceded by anything.
|
||||
int64Dec :: Int64 -> Builder
|
||||
int64Dec w = fromUnsafe (Unsafe.int64Dec w)
|
||||
int64Dec w = fromBounded (Bounded.int64Dec w)
|
||||
|
||||
-- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
|
||||
-- the encoding to 16 digits. This uses uppercase for the alphabetical
|
||||
-- digits. For example, this encodes the number 1022 as @00000000000003FE@.
|
||||
word64PaddedUpperHex :: Word64 -> Builder
|
||||
word64PaddedUpperHex w =
|
||||
fromUnsafe (Unsafe.word64PaddedUpperHex w)
|
||||
fromBounded (Bounded.word64PaddedUpperHex w)
|
||||
|
||||
-- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
|
||||
-- the encoding to 8 digits. This uses uppercase for the alphabetical
|
||||
-- digits. For example, this encodes the number 1022 as @000003FE@.
|
||||
word32PaddedUpperHex :: Word32 -> Builder
|
||||
word32PaddedUpperHex w =
|
||||
fromUnsafe (Unsafe.word32PaddedUpperHex w)
|
||||
fromBounded (Bounded.word32PaddedUpperHex w)
|
||||
|
||||
-- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
|
||||
-- the encoding to 4 digits. This uses uppercase for the alphabetical
|
||||
-- digits. For example, this encodes the number 1022 as @03FE@.
|
||||
word16PaddedUpperHex :: Word16 -> Builder
|
||||
word16PaddedUpperHex w =
|
||||
fromUnsafe (Unsafe.word16PaddedUpperHex w)
|
||||
fromBounded (Bounded.word16PaddedUpperHex w)
|
||||
|
||||
-- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
|
||||
-- the encoding to 2 digits. This uses uppercase for the alphabetical
|
||||
-- digits. For example, this encodes the number 11 as @0B@.
|
||||
word8PaddedUpperHex :: Word8 -> Builder
|
||||
word8PaddedUpperHex w =
|
||||
fromUnsafe (Unsafe.word8PaddedUpperHex w)
|
||||
fromBounded (Bounded.word8PaddedUpperHex w)
|
||||
|
||||
-- | Encode an ASCII char.
|
||||
-- Precondition: Input must be an ASCII character. This is not checked.
|
||||
ascii :: Char -> Builder
|
||||
ascii c = fromBounded (Bounded.char c)
|
||||
|
||||
-- | Encode an UTF8 char. This only uses as much space as is required.
|
||||
char :: Char -> Builder
|
||||
char c = fromBounded (Bounded.char c)
|
||||
|
||||
unST :: ST s a -> State# s -> (# State# s, a #)
|
||||
unST (ST f) = f
|
||||
|
@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
|
|||
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
|
||||
-- word in a big-endian fashion.
|
||||
word64BE :: Word64 -> Builder
|
||||
word64BE w = fromUnsafe (Unsafe.word64BE w)
|
||||
word64BE w = fromBounded (Bounded.word64BE w)
|
||||
|
||||
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
|
||||
-- word in a big-endian fashion.
|
||||
word32BE :: Word32 -> Builder
|
||||
word32BE w = fromUnsafe (Unsafe.word32BE w)
|
||||
word32BE w = fromBounded (Bounded.word32BE w)
|
||||
|
||||
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
|
||||
-- word in a big-endian fashion.
|
||||
word16BE :: Word16 -> Builder
|
||||
word16BE w = fromUnsafe (Unsafe.word16BE w)
|
||||
word16BE w = fromBounded (Bounded.word16BE w)
|
||||
|
||||
word8 :: Word8 -> Builder
|
||||
word8 w = fromBounded (Bounded.word8 w)
|
||||
|
||||
-- ShortText is already UTF-8 encoded. This is a no-op.
|
||||
shortTextToByteArray :: ShortText -> ByteArray
|
||||
|
@ -349,4 +364,4 @@ indexChar8Array :: ByteArray -> Int -> Char
|
|||
indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
|
||||
|
||||
c2w :: Char -> Word8
|
||||
c2w = fromIntegral . ord
|
||||
c2w = fromIntegral . ord
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
{-# language GADTSyntax #-}
|
||||
{-# language GADTs #-}
|
||||
{-# language KindSignatures #-}
|
||||
{-# language ScopedTypeVariables #-}
|
||||
{-# language BangPatterns #-}
|
||||
{-# language MagicHash #-}
|
||||
{-# language BinaryLiterals #-}
|
||||
{-# language UnboxedTuples #-}
|
||||
{-# language RankNTypes #-}
|
||||
{-# language LambdaCase #-}
|
||||
|
@ -10,9 +11,8 @@
|
|||
{-# language DataKinds #-}
|
||||
{-# language TypeApplications #-}
|
||||
|
||||
-- | The functions in this module do not check to
|
||||
-- see if there is enough space in the buffer.
|
||||
module Data.ByteArray.Builder.Small.Unsafe
|
||||
-- | The functions in this module are explict in the amount of bytes they require.
|
||||
module Data.ByteArray.Builder.Small.Bounded
|
||||
( -- * Builder
|
||||
Builder(..)
|
||||
, construct
|
||||
|
@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe
|
|||
, pasteIO
|
||||
-- * Combine
|
||||
, append
|
||||
-- * Bounds Manipulation
|
||||
, (<=)
|
||||
, lessThanEqual
|
||||
, isLessThanEqual
|
||||
, raise
|
||||
-- * Encode Integral Types
|
||||
-- ** Human-Readable
|
||||
, word64Dec
|
||||
|
@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe
|
|||
, word32PaddedUpperHex
|
||||
, word16PaddedUpperHex
|
||||
, word8PaddedUpperHex
|
||||
, ascii
|
||||
, char
|
||||
-- ** Machine-Readable
|
||||
, word64BE
|
||||
, word32BE
|
||||
|
@ -53,8 +60,9 @@ import GHC.Word
|
|||
import GHC.Int
|
||||
import Data.Kind
|
||||
import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
|
||||
import qualified GHC.TypeLits as GHC
|
||||
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
|
||||
import Control.Monad (when)
|
||||
import qualified Control.Category as Cat
|
||||
|
||||
import qualified Data.Primitive as PM
|
||||
|
||||
|
@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where
|
|||
(forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
|
||||
-> Builder n
|
||||
|
||||
knownNat :: KnownNat n => Proxy# n -> Int
|
||||
knownNat p = fromIntegral (natVal' p)
|
||||
|
||||
-- | Execute the builder. This function is safe.
|
||||
run :: forall n. KnownNat n
|
||||
=> Builder n -- ^ Builder
|
||||
-> ByteArray
|
||||
{-# inline run #-}
|
||||
run b = runST $ do
|
||||
arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n)))
|
||||
arr <- newByteArray (knownNat (proxy# :: Proxy# n))
|
||||
len <- pasteST b arr 0
|
||||
shrinkMutableByteArray arr len
|
||||
unsafeFreezeByteArray arr
|
||||
|
@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n
|
|||
{-# inline pasteGrowST #-}
|
||||
pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
|
||||
sz0 <- PM.getSizeofMutableByteArray arr0
|
||||
let req = fromIntegral (natVal' (proxy# :: Proxy# n))
|
||||
let req = knownNat (proxy# :: Proxy# n)
|
||||
let sz1 = off0 + req
|
||||
if sz1 <= sz0
|
||||
then do
|
||||
|
@ -131,6 +142,33 @@ append (Builder f) (Builder g) =
|
|||
Builder $ \arr off0 s0 -> case f arr off0 s0 of
|
||||
(# s1, r #) -> g arr r s1
|
||||
|
||||
-- | A proof that n is less than or equal to m
|
||||
newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n
|
||||
|
||||
instance Cat.Category (<=) where
|
||||
id = LessThanEqual 0
|
||||
-- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a)
|
||||
LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba)
|
||||
|
||||
-- | Dynamically check than 'n' is less than or equal to 'm'
|
||||
isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m)
|
||||
isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing
|
||||
where diff = knownNat m - knownNat n
|
||||
|
||||
-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time.
|
||||
lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m
|
||||
lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n))
|
||||
|
||||
-- | Weaken the bound on the maximum number of bytes required.
|
||||
-- >>> :{
|
||||
-- buildNumber :: Either Double Word64 -> Builder 32
|
||||
-- buildNumber = \case
|
||||
-- Left d -> doubleDec d
|
||||
-- Right w -> raise lessThanEqual (word64Dec w)
|
||||
-- :}
|
||||
raise :: forall m n. n <= m -> Builder n -> Builder m
|
||||
raise !_ (Builder f) = Builder f
|
||||
|
||||
-- | Encode a double-floating-point number, using decimal notation or
|
||||
-- scientific notation depending on the magnitude. This has undefined
|
||||
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
|
||||
|
@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do
|
|||
where
|
||||
w = W# w#
|
||||
|
||||
-- | Encode an ASCII char.
|
||||
-- Precondition: Input must be an ASCII character. This is not checked.
|
||||
ascii :: Char -> Builder 1
|
||||
ascii c = word8 (fromIntegral @Int @Word8 (ord c))
|
||||
|
||||
-- | Encode an UTF8 char. This only uses as much space as is required.
|
||||
char :: Char -> Builder 4
|
||||
char c
|
||||
| codepoint < 0x80 = construct $ \arr off -> do
|
||||
writeByteArray arr off (unsafeWordToWord8 codepoint)
|
||||
pure (off + 1)
|
||||
| codepoint < 0x800 = construct $ \arr off -> do
|
||||
writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint))
|
||||
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
|
||||
return (off + 2)
|
||||
| codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do
|
||||
-- Codepoint U+FFFD
|
||||
writeByteArray arr off (0xEF :: Word8)
|
||||
writeByteArray arr (off + 1) (0xBF :: Word8)
|
||||
writeByteArray arr (off + 2) (0xBD :: Word8)
|
||||
return (off + 3)
|
||||
| codepoint < 0x10000 = construct $ \arr off -> do
|
||||
writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint))
|
||||
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
|
||||
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
|
||||
return (off + 3)
|
||||
| otherwise = construct $ \arr off -> do
|
||||
writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint))
|
||||
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
|
||||
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
|
||||
writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
|
||||
return (off + 4)
|
||||
|
||||
where
|
||||
codepoint :: Word
|
||||
codepoint = fromIntegral (ord c)
|
||||
|
||||
unsafeWordToWord8 :: Word -> Word8
|
||||
unsafeWordToWord8 (W# w) = W8# w
|
||||
|
||||
-- precondition: codepoint is less than 0x800
|
||||
byteTwoOne :: Word -> Word
|
||||
byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
|
||||
|
||||
byteTwoTwo :: Word -> Word
|
||||
byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
|
||||
|
||||
-- precondition: codepoint is less than 0x1000
|
||||
byteThreeOne :: Word -> Word
|
||||
byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
|
||||
|
||||
byteThreeTwo :: Word -> Word
|
||||
byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
|
||||
|
||||
byteThreeThree :: Word -> Word
|
||||
byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
|
||||
|
||||
-- precondition: codepoint is less than 0x110000
|
||||
byteFourOne :: Word -> Word
|
||||
byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
|
||||
|
||||
byteFourTwo :: Word -> Word
|
||||
byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
|
||||
|
||||
byteFourThree :: Word -> Word
|
||||
byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
|
||||
|
||||
byteFourFour :: Word -> Word
|
||||
byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
|
||||
|
||||
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
|
||||
-- word in a big-endian fashion.
|
||||
word64BE :: Word64 -> Builder 8
|
Loading…
Reference in a new issue