diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 8fe031a..f5e511f 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -37,7 +37,7 @@ flag checked library exposed-modules: Data.ByteArray.Builder.Small - Data.ByteArray.Builder.Small.Unsafe + Data.ByteArray.Builder.Small.Bounded build-depends: , base >=4.12.0.0 && <5 , byteslice >=0.1 && <0.2 diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder/Small.hs index f2f0e54..a9cfabe 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder/Small.hs @@ -7,10 +7,10 @@ {-# language UnboxedTuples #-} module Data.ByteArray.Builder.Small - ( -- * Unsafe Primitives + ( -- * Bounded Primitives Builder(..) , construct - , fromUnsafe + , fromBounded -- * Evaluation , run , pasteST @@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE , word16BE + , word8 -- * Encode Floating-Point Types -- ** Human-Readable , doubleDec @@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V -import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe +import qualified Data.ByteArray.Builder.Small.Bounded as Bounded -- | An unmaterialized sequence of bytes that may be pasted -- into a mutable byte array. @@ -183,9 +186,9 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) -fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder -{-# inline fromUnsafe #-} -fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 -> +fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder +{-# inline fromBounded #-} +fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 -> case fromIntegral (natVal' (proxy# :: Proxy# n)) of I# req -> case len >=# req of 1# -> f arr off s0 @@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) else do write2 dst doff '\\' 'u' - doff' <- Unsafe.pasteST - (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c))) + doff' <- Bounded.pasteST + (Bounded.word16PaddedUpperHex (fromIntegral (c2w c))) dst (doff + 2) go (soff + 1) (slen - 1) doff' else pure doff @@ -262,61 +265,70 @@ shortTextJsonString a = -- This encoding never starts with a zero unless the -- argument was zero. word64Dec :: Word64 -> Builder -word64Dec w = fromUnsafe (Unsafe.word64Dec w) +word64Dec w = fromBounded (Bounded.word64Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word32Dec :: Word32 -> Builder -word32Dec w = fromUnsafe (Unsafe.word32Dec w) +word32Dec w = fromBounded (Bounded.word32Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word16Dec :: Word16 -> Builder -word16Dec w = fromUnsafe (Unsafe.word16Dec w) +word16Dec w = fromBounded (Bounded.word16Dec w) -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- crash, but the generated numbers will be nonsense. doubleDec :: Double -> Builder -doubleDec w = fromUnsafe (Unsafe.doubleDec w) +doubleDec w = fromBounded (Bounded.doubleDec w) -- | Encodes a signed 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. -- Negative numbers are preceded by a minus sign. Positive numbers -- are not preceded by anything. int64Dec :: Int64 -> Builder -int64Dec w = fromUnsafe (Unsafe.int64Dec w) +int64Dec w = fromBounded (Bounded.int64Dec w) -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 16 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @00000000000003FE@. word64PaddedUpperHex :: Word64 -> Builder word64PaddedUpperHex w = - fromUnsafe (Unsafe.word64PaddedUpperHex w) + fromBounded (Bounded.word64PaddedUpperHex w) -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 8 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @000003FE@. word32PaddedUpperHex :: Word32 -> Builder word32PaddedUpperHex w = - fromUnsafe (Unsafe.word32PaddedUpperHex w) + fromBounded (Bounded.word32PaddedUpperHex w) -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 4 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @03FE@. word16PaddedUpperHex :: Word16 -> Builder word16PaddedUpperHex w = - fromUnsafe (Unsafe.word16PaddedUpperHex w) + fromBounded (Bounded.word16PaddedUpperHex w) -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 2 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 11 as @0B@. word8PaddedUpperHex :: Word8 -> Builder word8PaddedUpperHex w = - fromUnsafe (Unsafe.word8PaddedUpperHex w) + fromBounded (Bounded.word8PaddedUpperHex w) + +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder +ascii c = fromBounded (Bounded.char c) + +-- | Encode an UTF8 char. This only uses as much space as is required. +char :: Char -> Builder +char c = fromBounded (Bounded.char c) unST :: ST s a -> State# s -> (# State# s, a #) unST (ST f) = f @@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) = -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder -word64BE w = fromUnsafe (Unsafe.word64BE w) +word64BE w = fromBounded (Bounded.word64BE w) -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder -word32BE w = fromUnsafe (Unsafe.word32BE w) +word32BE w = fromBounded (Bounded.word32BE w) -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder -word16BE w = fromUnsafe (Unsafe.word16BE w) +word16BE w = fromBounded (Bounded.word16BE w) + +word8 :: Word8 -> Builder +word8 w = fromBounded (Bounded.word8 w) -- ShortText is already UTF-8 encoded. This is a no-op. shortTextToByteArray :: ShortText -> ByteArray @@ -349,4 +364,4 @@ indexChar8Array :: ByteArray -> Int -> Char indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) c2w :: Char -> Word8 -c2w = fromIntegral . ord +c2w = fromIntegral . ord diff --git a/src/Data/ByteArray/Builder/Small/Unsafe.hs b/src/Data/ByteArray/Builder/Small/Bounded.hs similarity index 79% rename from src/Data/ByteArray/Builder/Small/Unsafe.hs rename to src/Data/ByteArray/Builder/Small/Bounded.hs index 1ae6695..429e0c5 100644 --- a/src/Data/ByteArray/Builder/Small/Unsafe.hs +++ b/src/Data/ByteArray/Builder/Small/Bounded.hs @@ -1,8 +1,9 @@ -{-# language GADTSyntax #-} +{-# language GADTs #-} {-# language KindSignatures #-} {-# language ScopedTypeVariables #-} {-# language BangPatterns #-} {-# language MagicHash #-} +{-# language BinaryLiterals #-} {-# language UnboxedTuples #-} {-# language RankNTypes #-} {-# language LambdaCase #-} @@ -10,9 +11,8 @@ {-# language DataKinds #-} {-# language TypeApplications #-} --- | The functions in this module do not check to --- see if there is enough space in the buffer. -module Data.ByteArray.Builder.Small.Unsafe +-- | The functions in this module are explict in the amount of bytes they require. +module Data.ByteArray.Builder.Small.Bounded ( -- * Builder Builder(..) , construct @@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe , pasteIO -- * Combine , append + -- * Bounds Manipulation + , (<=) + , lessThanEqual + , isLessThanEqual + , raise -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE @@ -53,8 +60,9 @@ import GHC.Word import GHC.Int import Data.Kind import GHC.TypeLits (KnownNat,Nat,type (+),natVal') +import qualified GHC.TypeLits as GHC import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import Control.Monad (when) +import qualified Control.Category as Cat import qualified Data.Primitive as PM @@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) -> Builder n +knownNat :: KnownNat n => Proxy# n -> Int +knownNat p = fromIntegral (natVal' p) + -- | Execute the builder. This function is safe. run :: forall n. KnownNat n => Builder n -- ^ Builder -> ByteArray {-# inline run #-} run b = runST $ do - arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n))) + arr <- newByteArray (knownNat (proxy# :: Proxy# n)) len <- pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr @@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n {-# inline pasteGrowST #-} pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do sz0 <- PM.getSizeofMutableByteArray arr0 - let req = fromIntegral (natVal' (proxy# :: Proxy# n)) + let req = knownNat (proxy# :: Proxy# n) let sz1 = off0 + req if sz1 <= sz0 then do @@ -131,6 +142,33 @@ append (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 +-- | A proof that n is less than or equal to m +newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n + +instance Cat.Category (<=) where + id = LessThanEqual 0 + -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a) + LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba) + +-- | Dynamically check than 'n' is less than or equal to 'm' +isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m) +isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing + where diff = knownNat m - knownNat n + +-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time. +lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m +lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n)) + +-- | Weaken the bound on the maximum number of bytes required. +-- >>> :{ +-- buildNumber :: Either Double Word64 -> Builder 32 +-- buildNumber = \case +-- Left d -> doubleDec d +-- Right w -> raise lessThanEqual (word64Dec w) +-- :} +raise :: forall m n. n <= m -> Builder n -> Builder m +raise !_ (Builder f) = Builder f + -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not @@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do where w = W# w# +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder 1 +ascii c = word8 (fromIntegral @Int @Word8 (ord c)) + +-- | Encode an UTF8 char. This only uses as much space as is required. +char :: Char -> Builder 4 +char c + | codepoint < 0x80 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 codepoint) + pure (off + 1) + | codepoint < 0x800 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint)) + return (off + 2) + | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do + -- Codepoint U+FFFD + writeByteArray arr off (0xEF :: Word8) + writeByteArray arr (off + 1) (0xBF :: Word8) + writeByteArray arr (off + 2) (0xBD :: Word8) + return (off + 3) + | codepoint < 0x10000 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint)) + return (off + 3) + | otherwise = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint)) + writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint)) + return (off + 4) + + where + codepoint :: Word + codepoint = fromIntegral (ord c) + + unsafeWordToWord8 :: Word -> Word8 + unsafeWordToWord8 (W# w) = W8# w + + -- precondition: codepoint is less than 0x800 + byteTwoOne :: Word -> Word + byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000 + + byteTwoTwo :: Word -> Word + byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x1000 + byteThreeOne :: Word -> Word + byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000 + + byteThreeTwo :: Word -> Word + byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteThreeThree :: Word -> Word + byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x110000 + byteFourOne :: Word -> Word + byteFourOne w = unsafeShiftR w 18 .|. 0b11110000 + + byteFourTwo :: Word -> Word + byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000 + + byteFourThree :: Word -> Word + byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteFourFour :: Word -> Word + byteFourFour w = (0b00111111 .&. w) .|. 0b10000000 + -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder 8