From 9a14600a388c00d07b00b69126cd3e7b5ee9b1dd Mon Sep 17 00:00:00 2001 From: Alice McKean Date: Sun, 1 Sep 2019 22:43:54 -0700 Subject: [PATCH 1/4] switch unsafe to bounded + char encoding --- small-bytearray-builder.cabal | 2 +- src/Data/ByteArray/Builder/Small.hs | 57 +++++--- .../Builder/Small/{Unsafe.hs => Bounded.hs} | 122 +++++++++++++++++- 3 files changed, 152 insertions(+), 29 deletions(-) rename src/Data/ByteArray/Builder/Small/{Unsafe.hs => Bounded.hs} (79%) diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 8fe031a..f5e511f 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -37,7 +37,7 @@ flag checked library exposed-modules: Data.ByteArray.Builder.Small - Data.ByteArray.Builder.Small.Unsafe + Data.ByteArray.Builder.Small.Bounded build-depends: , base >=4.12.0.0 && <5 , byteslice >=0.1 && <0.2 diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder/Small.hs index f2f0e54..a9cfabe 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder/Small.hs @@ -7,10 +7,10 @@ {-# language UnboxedTuples #-} module Data.ByteArray.Builder.Small - ( -- * Unsafe Primitives + ( -- * Bounded Primitives Builder(..) , construct - , fromUnsafe + , fromBounded -- * Evaluation , run , pasteST @@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE , word16BE + , word8 -- * Encode Floating-Point Types -- ** Human-Readable , doubleDec @@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V -import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe +import qualified Data.ByteArray.Builder.Small.Bounded as Bounded -- | An unmaterialized sequence of bytes that may be pasted -- into a mutable byte array. @@ -183,9 +186,9 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) -fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder -{-# inline fromUnsafe #-} -fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 -> +fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder +{-# inline fromBounded #-} +fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 -> case fromIntegral (natVal' (proxy# :: Proxy# n)) of I# req -> case len >=# req of 1# -> f arr off s0 @@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) else do write2 dst doff '\\' 'u' - doff' <- Unsafe.pasteST - (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c))) + doff' <- Bounded.pasteST + (Bounded.word16PaddedUpperHex (fromIntegral (c2w c))) dst (doff + 2) go (soff + 1) (slen - 1) doff' else pure doff @@ -262,61 +265,70 @@ shortTextJsonString a = -- This encoding never starts with a zero unless the -- argument was zero. word64Dec :: Word64 -> Builder -word64Dec w = fromUnsafe (Unsafe.word64Dec w) +word64Dec w = fromBounded (Bounded.word64Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word32Dec :: Word32 -> Builder -word32Dec w = fromUnsafe (Unsafe.word32Dec w) +word32Dec w = fromBounded (Bounded.word32Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word16Dec :: Word16 -> Builder -word16Dec w = fromUnsafe (Unsafe.word16Dec w) +word16Dec w = fromBounded (Bounded.word16Dec w) -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- crash, but the generated numbers will be nonsense. doubleDec :: Double -> Builder -doubleDec w = fromUnsafe (Unsafe.doubleDec w) +doubleDec w = fromBounded (Bounded.doubleDec w) -- | Encodes a signed 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. -- Negative numbers are preceded by a minus sign. Positive numbers -- are not preceded by anything. int64Dec :: Int64 -> Builder -int64Dec w = fromUnsafe (Unsafe.int64Dec w) +int64Dec w = fromBounded (Bounded.int64Dec w) -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 16 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @00000000000003FE@. word64PaddedUpperHex :: Word64 -> Builder word64PaddedUpperHex w = - fromUnsafe (Unsafe.word64PaddedUpperHex w) + fromBounded (Bounded.word64PaddedUpperHex w) -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 8 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @000003FE@. word32PaddedUpperHex :: Word32 -> Builder word32PaddedUpperHex w = - fromUnsafe (Unsafe.word32PaddedUpperHex w) + fromBounded (Bounded.word32PaddedUpperHex w) -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 4 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @03FE@. word16PaddedUpperHex :: Word16 -> Builder word16PaddedUpperHex w = - fromUnsafe (Unsafe.word16PaddedUpperHex w) + fromBounded (Bounded.word16PaddedUpperHex w) -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 2 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 11 as @0B@. word8PaddedUpperHex :: Word8 -> Builder word8PaddedUpperHex w = - fromUnsafe (Unsafe.word8PaddedUpperHex w) + fromBounded (Bounded.word8PaddedUpperHex w) + +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder +ascii c = fromBounded (Bounded.char c) + +-- | Encode an UTF8 char. This only uses as much space as is required. +char :: Char -> Builder +char c = fromBounded (Bounded.char c) unST :: ST s a -> State# s -> (# State# s, a #) unST (ST f) = f @@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) = -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder -word64BE w = fromUnsafe (Unsafe.word64BE w) +word64BE w = fromBounded (Bounded.word64BE w) -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder -word32BE w = fromUnsafe (Unsafe.word32BE w) +word32BE w = fromBounded (Bounded.word32BE w) -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder -word16BE w = fromUnsafe (Unsafe.word16BE w) +word16BE w = fromBounded (Bounded.word16BE w) + +word8 :: Word8 -> Builder +word8 w = fromBounded (Bounded.word8 w) -- ShortText is already UTF-8 encoded. This is a no-op. shortTextToByteArray :: ShortText -> ByteArray @@ -349,4 +364,4 @@ indexChar8Array :: ByteArray -> Int -> Char indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) c2w :: Char -> Word8 -c2w = fromIntegral . ord +c2w = fromIntegral . ord diff --git a/src/Data/ByteArray/Builder/Small/Unsafe.hs b/src/Data/ByteArray/Builder/Small/Bounded.hs similarity index 79% rename from src/Data/ByteArray/Builder/Small/Unsafe.hs rename to src/Data/ByteArray/Builder/Small/Bounded.hs index 1ae6695..429e0c5 100644 --- a/src/Data/ByteArray/Builder/Small/Unsafe.hs +++ b/src/Data/ByteArray/Builder/Small/Bounded.hs @@ -1,8 +1,9 @@ -{-# language GADTSyntax #-} +{-# language GADTs #-} {-# language KindSignatures #-} {-# language ScopedTypeVariables #-} {-# language BangPatterns #-} {-# language MagicHash #-} +{-# language BinaryLiterals #-} {-# language UnboxedTuples #-} {-# language RankNTypes #-} {-# language LambdaCase #-} @@ -10,9 +11,8 @@ {-# language DataKinds #-} {-# language TypeApplications #-} --- | The functions in this module do not check to --- see if there is enough space in the buffer. -module Data.ByteArray.Builder.Small.Unsafe +-- | The functions in this module are explict in the amount of bytes they require. +module Data.ByteArray.Builder.Small.Bounded ( -- * Builder Builder(..) , construct @@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe , pasteIO -- * Combine , append + -- * Bounds Manipulation + , (<=) + , lessThanEqual + , isLessThanEqual + , raise -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE @@ -53,8 +60,9 @@ import GHC.Word import GHC.Int import Data.Kind import GHC.TypeLits (KnownNat,Nat,type (+),natVal') +import qualified GHC.TypeLits as GHC import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import Control.Monad (when) +import qualified Control.Category as Cat import qualified Data.Primitive as PM @@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) -> Builder n +knownNat :: KnownNat n => Proxy# n -> Int +knownNat p = fromIntegral (natVal' p) + -- | Execute the builder. This function is safe. run :: forall n. KnownNat n => Builder n -- ^ Builder -> ByteArray {-# inline run #-} run b = runST $ do - arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n))) + arr <- newByteArray (knownNat (proxy# :: Proxy# n)) len <- pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr @@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n {-# inline pasteGrowST #-} pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do sz0 <- PM.getSizeofMutableByteArray arr0 - let req = fromIntegral (natVal' (proxy# :: Proxy# n)) + let req = knownNat (proxy# :: Proxy# n) let sz1 = off0 + req if sz1 <= sz0 then do @@ -131,6 +142,33 @@ append (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 +-- | A proof that n is less than or equal to m +newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n + +instance Cat.Category (<=) where + id = LessThanEqual 0 + -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a) + LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba) + +-- | Dynamically check than 'n' is less than or equal to 'm' +isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m) +isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing + where diff = knownNat m - knownNat n + +-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time. +lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m +lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n)) + +-- | Weaken the bound on the maximum number of bytes required. +-- >>> :{ +-- buildNumber :: Either Double Word64 -> Builder 32 +-- buildNumber = \case +-- Left d -> doubleDec d +-- Right w -> raise lessThanEqual (word64Dec w) +-- :} +raise :: forall m n. n <= m -> Builder n -> Builder m +raise !_ (Builder f) = Builder f + -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not @@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do where w = W# w# +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder 1 +ascii c = word8 (fromIntegral @Int @Word8 (ord c)) + +-- | Encode an UTF8 char. This only uses as much space as is required. +char :: Char -> Builder 4 +char c + | codepoint < 0x80 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 codepoint) + pure (off + 1) + | codepoint < 0x800 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint)) + return (off + 2) + | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do + -- Codepoint U+FFFD + writeByteArray arr off (0xEF :: Word8) + writeByteArray arr (off + 1) (0xBF :: Word8) + writeByteArray arr (off + 2) (0xBD :: Word8) + return (off + 3) + | codepoint < 0x10000 = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint)) + return (off + 3) + | otherwise = construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint)) + writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint)) + return (off + 4) + + where + codepoint :: Word + codepoint = fromIntegral (ord c) + + unsafeWordToWord8 :: Word -> Word8 + unsafeWordToWord8 (W# w) = W8# w + + -- precondition: codepoint is less than 0x800 + byteTwoOne :: Word -> Word + byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000 + + byteTwoTwo :: Word -> Word + byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x1000 + byteThreeOne :: Word -> Word + byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000 + + byteThreeTwo :: Word -> Word + byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteThreeThree :: Word -> Word + byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x110000 + byteFourOne :: Word -> Word + byteFourOne w = unsafeShiftR w 18 .|. 0b11110000 + + byteFourTwo :: Word -> Word + byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000 + + byteFourThree :: Word -> Word + byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteFourFour :: Word -> Word + byteFourFour w = (0b00111111 .&. w) .|. 0b10000000 + -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder 8 From 2eb37dfb7ca72b08fb3da15d4bdb7b77b428c18f Mon Sep 17 00:00:00 2001 From: Alice McKean Date: Sun, 1 Sep 2019 22:56:06 -0700 Subject: [PATCH 2/4] add monoidal unit for Bounded Builders --- src/Data/ByteArray/Builder/Small/Bounded.hs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Data/ByteArray/Builder/Small/Bounded.hs b/src/Data/ByteArray/Builder/Small/Bounded.hs index 429e0c5..892b36d 100644 --- a/src/Data/ByteArray/Builder/Small/Bounded.hs +++ b/src/Data/ByteArray/Builder/Small/Bounded.hs @@ -22,6 +22,7 @@ module Data.ByteArray.Builder.Small.Bounded , pasteGrowST , pasteIO -- * Combine + , empty , append -- * Bounds Manipulation , (<=) @@ -136,6 +137,10 @@ construct f = Builder infixr 9 `append` +-- | The monoidal unit of `append` +empty :: Builder 0 +empty = Builder $ \_ off0 s0 -> (# s0, off0 #) + -- | Concatenate two builders. append :: Builder n -> Builder m -> Builder (n + m) append (Builder f) (Builder g) = From 2d1ea682617ad952f184c1f12baea4db564e1702 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 3 Sep 2019 14:43:58 -0400 Subject: [PATCH 3/4] Remove the word "Small" from module names. Move unsafe functions into modules suffixed with "Unsafe". Write a better IsString instance. Depend on natural-arithmetic. --- bench/Main.hs | 2 +- cabal.project | 5 + common/HexWord64.hs | 2 +- small-bytearray-builder.cabal | 9 +- .../{Builder/Small.hs => Builder.hs} | 36 ++--- .../ByteArray/Builder/{Small => }/Bounded.hs | 149 ++++++------------ src/Data/ByteArray/Builder/Bounded/Unsafe.hs | 59 +++++++ src/Data/ByteArray/Builder/Unsafe.hs | 94 +++++++++++ test/Main.hs | 28 ++-- 9 files changed, 247 insertions(+), 137 deletions(-) create mode 100644 cabal.project rename src/Data/ByteArray/{Builder/Small.hs => Builder.hs} (93%) rename src/Data/ByteArray/Builder/{Small => }/Bounded.hs (82%) create mode 100644 src/Data/ByteArray/Builder/Bounded/Unsafe.hs create mode 100644 src/Data/ByteArray/Builder/Unsafe.hs diff --git a/bench/Main.hs b/bench/Main.hs index a77a65e..28ef227 100644 --- a/bench/Main.hs +++ b/bench/Main.hs @@ -2,7 +2,7 @@ import Gauge (bgroup,bench,whnf) import Gauge.Main (defaultMain) import Data.Word (Word64) import Data.Primitive (ByteArray) -import qualified Data.ByteArray.Builder.Small.Unsafe as U +import qualified Data.ByteArray.Builder.Bounded as U import qualified HexWord64 diff --git a/cabal.project b/cabal.project new file mode 100644 index 0000000..079db62 --- /dev/null +++ b/cabal.project @@ -0,0 +1,5 @@ +packages: . +source-repository-package + type: git + location: https://github.com/andrewthad/natural-arithmetic + tag: 68868c96b58ddaf71bb865b247d2c14c3668f4c2 diff --git a/common/HexWord64.hs b/common/HexWord64.hs index 68ea963..65c5dfa 100644 --- a/common/HexWord64.hs +++ b/common/HexWord64.hs @@ -17,7 +17,7 @@ module HexWord64 import GHC.ST (ST(ST)) import Data.Bits -import Data.ByteArray.Builder.Small.Unsafe (Builder,construct) +import Data.ByteArray.Builder.Bounded.Unsafe (Builder,construct) import Data.Primitive import Data.Word import GHC.Exts diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index f5e511f..9d613bd 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -36,8 +36,10 @@ flag checked library exposed-modules: - Data.ByteArray.Builder.Small - Data.ByteArray.Builder.Small.Bounded + Data.ByteArray.Builder + Data.ByteArray.Builder.Unsafe + Data.ByteArray.Builder.Bounded + Data.ByteArray.Builder.Bounded.Unsafe build-depends: , base >=4.12.0.0 && <5 , byteslice >=0.1 && <0.2 @@ -46,6 +48,7 @@ library , vector >=0.12.0.3 && <0.13 , bytestring >=0.10.8.2 && <0.11 , text-short >=0.1.3 && <0.2 + , natural-arithmetic >=0.1 && <0.2 if flag(checked) build-depends: primitive-checked >= 0.7 && <0.8 else @@ -59,6 +62,7 @@ test-suite test type: exitcode-stdio-1.0 hs-source-dirs: test, common main-is: Main.hs + ghc-options: -O2 -Wall other-modules: HexWord64 build-depends: @@ -70,6 +74,7 @@ test-suite test , tasty-quickcheck >=0.10.1 && <0.11 , tasty-hunit >=0.10.0.2 && <0.11 , tasty >=1.2.3 && <1.3 + , text >=1.2 && <1.3 , primitive , vector diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder.hs similarity index 93% rename from src/Data/ByteArray/Builder/Small.hs rename to src/Data/ByteArray/Builder.hs index a9cfabe..ce49b26 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder.hs @@ -6,7 +6,7 @@ {-# language ScopedTypeVariables #-} {-# language UnboxedTuples #-} -module Data.ByteArray.Builder.Small +module Data.ByteArray.Builder ( -- * Bounded Primitives Builder(..) , construct @@ -24,6 +24,8 @@ module Data.ByteArray.Builder.Small , bytearray , shortTextUtf8 , shortTextJsonString + , cstring + , stringUtf8 -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -60,32 +62,15 @@ import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.Text.Short (ShortText) import Data.Char (ord) +import Data.ByteArray.Builder.Unsafe (Builder(Builder)) +import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V -import qualified Data.ByteArray.Builder.Small.Bounded as Bounded - --- | An unmaterialized sequence of bytes that may be pasted --- into a mutable byte array. -newtype Builder = Builder - -- This functions takes an offset and a number of remaining bytes - -- and returns the new offset. - (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) - -instance Semigroup Builder where - {-# inline (<>) #-} - Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of - (# s1, r #) -> case r /=# (-1#) of - 1# -> g arr r (len0 +# (off0 -# r)) s1 - _ -> (# s1, (-1#) #) - -instance Monoid Builder where - mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) - -instance IsString Builder where - fromString = shortTextUtf8 . TS.fromString +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded -- | Run a builder. An accurate size hint is important for good performance. -- The size hint should be slightly larger than the actual size. @@ -101,7 +86,7 @@ run hint b = runByteArrayST $ do Just len -> do shrinkMutableByteArray arr len unsafeFreezeByteArray arr - go hint + go (max hint 1) -- | Variant of 'pasteArrayST' that runs in 'IO'. pasteArrayIO :: @@ -186,9 +171,10 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) +-- | Convert a bounded builder to an unbounded one. fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder {-# inline fromBounded #-} -fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 -> +fromBounded (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> case fromIntegral (natVal' (proxy# :: Proxy# n)) of I# req -> case len >=# req of 1# -> f arr off s0 @@ -226,7 +212,7 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) else do write2 dst doff '\\' 'u' - doff' <- Bounded.pasteST + doff' <- UnsafeBounded.pasteST (Bounded.word16PaddedUpperHex (fromIntegral (c2w c))) dst (doff + 2) go (soff + 1) (slen - 1) doff' diff --git a/src/Data/ByteArray/Builder/Small/Bounded.hs b/src/Data/ByteArray/Builder/Bounded.hs similarity index 82% rename from src/Data/ByteArray/Builder/Small/Bounded.hs rename to src/Data/ByteArray/Builder/Bounded.hs index 892b36d..5db8a12 100644 --- a/src/Data/ByteArray/Builder/Small/Bounded.hs +++ b/src/Data/ByteArray/Builder/Bounded.hs @@ -1,4 +1,3 @@ -{-# language GADTs #-} {-# language KindSignatures #-} {-# language ScopedTypeVariables #-} {-# language BangPatterns #-} @@ -12,23 +11,18 @@ {-# language TypeApplications #-} -- | The functions in this module are explict in the amount of bytes they require. -module Data.ByteArray.Builder.Small.Bounded +module Data.ByteArray.Builder.Bounded ( -- * Builder - Builder(..) - , construct + Builder -- * Execute , run - , pasteST , pasteGrowST - , pasteIO -- * Combine , empty , append -- * Bounds Manipulation - , (<=) - , lessThanEqual - , isLessThanEqual - , raise + , weaken + , substitute -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -56,46 +50,33 @@ import Data.Bits import Data.Char (ord) import Data.Primitive import GHC.Exts -import GHC.ST -import GHC.Word -import GHC.Int -import Data.Kind -import GHC.TypeLits (KnownNat,Nat,type (+),natVal') -import qualified GHC.TypeLits as GHC +import GHC.ST (ST(ST)) +import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) +import GHC.Int (Int64(I64#)) +import GHC.TypeLits (KnownNat,type (+),natVal') import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import qualified Control.Category as Cat +import Arithmetic.Types (type (<=), type (:=:)) +import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..)) +import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe import qualified Data.Primitive as PM --- | A builder parameterized by the maximum number of bytes it uses --- when executed. -newtype Builder :: Nat -> Type where - Builder :: - (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) - -> Builder n - +-- Used internally. knownNat :: KnownNat n => Proxy# n -> Int +{-# inline knownNat #-} knownNat p = fromIntegral (natVal' p) --- | Execute the builder. This function is safe. +-- | Execute the bounded builder. run :: forall n. KnownNat n => Builder n -- ^ Builder -> ByteArray {-# inline run #-} run b = runST $ do arr <- newByteArray (knownNat (proxy# :: Proxy# n)) - len <- pasteST b arr 0 + len <- Unsafe.pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int -{-# inline pasteST #-} -pasteST (Builder f) (MutableByteArray arr) (I# off) = - ST $ \s0 -> case f arr off s0 of - (# s1, r #) -> (# s1, (I# r) #) - -- | Paste the builder into the byte array starting at offset zero. -- This reallocates the byte array if it cannot accomodate the builder, -- growing it by the minimum amount necessary. @@ -112,67 +93,40 @@ pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do let sz1 = off0 + req if sz1 <= sz0 then do - off1 <- pasteST b arr0 off0 + off1 <- Unsafe.pasteST b arr0 off0 pure (MutableByteArrayOffset arr0 off1) else do arr1 <- PM.resizeMutableByteArray arr0 sz1 - off1 <- pasteST b arr1 off0 + off1 <- Unsafe.pasteST b arr1 off0 pure (MutableByteArrayOffset arr1 off1) --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int -{-# inline pasteIO #-} -pasteIO b m off = stToIO (pasteST b m off) - --- | Constructor for 'Builder' that works on a function with lifted --- arguments instead of unlifted ones. This is just as unsafe as the --- actual constructor. -construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n -{-# inline construct #-} -construct f = Builder - $ \arr off s0 -> - case unST (f (MutableByteArray arr) (I# off)) s0 of - (# s1, (I# n) #) -> (# s1, n #) - -infixr 9 `append` - -- | The monoidal unit of `append` empty :: Builder 0 empty = Builder $ \_ off0 s0 -> (# s0, off0 #) +infixr 9 `append` + -- | Concatenate two builders. -append :: Builder n -> Builder m -> Builder (n + m) +append :: Builder m -> Builder n -> Builder (m + n) append (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 --- | A proof that n is less than or equal to m -newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n +-- | Weaken the bound on the maximum number of bytes required. For example, +-- to use two builders with unequal bounds in a disjunctive setting: +-- +-- > import qualified Arithmetic.Lte as Lte +-- > +-- > buildNumber :: Either Double Word64 -> Builder 32 +-- > buildNumber = \case +-- > Left d -> doubleDec d +-- > Right w -> weaken (Lte.constant @19 @32) (word64Dec w) +weaken :: forall m n. (m <= n) -> Builder m -> Builder n +weaken !_ (Builder f) = Builder f -instance Cat.Category (<=) where - id = LessThanEqual 0 - -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a) - LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba) - --- | Dynamically check than 'n' is less than or equal to 'm' -isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m) -isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing - where diff = knownNat m - knownNat n - --- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time. -lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m -lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n)) - --- | Weaken the bound on the maximum number of bytes required. --- >>> :{ --- buildNumber :: Either Double Word64 -> Builder 32 --- buildNumber = \case --- Left d -> doubleDec d --- Right w -> raise lessThanEqual (word64Dec w) --- :} -raise :: forall m n. n <= m -> Builder n -> Builder m -raise !_ (Builder f) = Builder f +-- | Replace the upper bound on size with an equal number. +substitute :: forall m n. (m :=: n) -> Builder m -> Builder n +substitute !_ (Builder f) = Builder f -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined @@ -207,7 +161,7 @@ int64Dec (I64# w) = int64Dec# w -- the word. This is only used internally. wordCommonDec# :: Word# -> Builder n {-# noinline wordCommonDec# #-} -wordCommonDec# w# = construct $ \arr off0 -> if w /= 0 +wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0 then internalWordLoop arr off0 (W# w#) else do writeByteArray arr off0 (c2w '0') @@ -230,7 +184,7 @@ internalWordLoop arr off0 x0 = go off0 x0 where -- | Requires up to 19 bytes. int64Dec# :: Int# -> Builder 20 {-# noinline int64Dec# #-} -int64Dec# w# = construct $ \arr off0 -> case compare w 0 of +int64Dec# w# = Unsafe.construct $ \arr off0 -> case compare w 0 of GT -> internalWordLoop arr off0 (fromIntegral w) EQ -> do writeByteArray arr off0 (c2w '0') @@ -286,7 +240,7 @@ word8PaddedUpperHex (W8# w) = word8PaddedUpperHex# w -- might not be. Benchmark this. word64PaddedUpperHex# :: Word# -> Builder 16 {-# noinline word64PaddedUpperHex# #-} -word64PaddedUpperHex# w# = construct $ \arr off -> do +word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 60)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52)) @@ -309,7 +263,7 @@ word64PaddedUpperHex# w# = construct $ \arr off -> do word32PaddedUpperHex# :: Word# -> Builder 8 {-# noinline word32PaddedUpperHex# #-} -word32PaddedUpperHex# w# = construct $ \arr off -> do +word32PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 28)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 24)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 20)) @@ -326,7 +280,7 @@ word32PaddedUpperHex# w# = construct $ \arr off -> do -- GHC make the decision. Open an issue on github if this is -- a problem. word16PaddedUpperHex# :: Word# -> Builder 4 -word16PaddedUpperHex# w# = construct $ \arr off -> do +word16PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 12)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 8)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 4)) @@ -338,7 +292,7 @@ word16PaddedUpperHex# w# = construct $ \arr off -> do -- Definitely want this to inline. It's maybe a dozen instructions total. word8PaddedUpperHex# :: Word# -> Builder 2 {-# inline word8PaddedUpperHex #-} -word8PaddedUpperHex# w# = construct $ \arr off -> do +word8PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 4)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 0)) pure (off + 2) @@ -350,28 +304,28 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do ascii :: Char -> Builder 1 ascii c = word8 (fromIntegral @Int @Word8 (ord c)) --- | Encode an UTF8 char. This only uses as much space as is required. +-- | Encode a character as UTF-8. This only uses as much space as is required. char :: Char -> Builder 4 char c - | codepoint < 0x80 = construct $ \arr off -> do + | codepoint < 0x80 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 codepoint) pure (off + 1) - | codepoint < 0x800 = construct $ \arr off -> do + | codepoint < 0x800 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint)) return (off + 2) - | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do + | codepoint >= 0xD800 && codepoint < 0xE000 = Unsafe.construct $ \arr off -> do -- Codepoint U+FFFD writeByteArray arr off (0xEF :: Word8) writeByteArray arr (off + 1) (0xBF :: Word8) writeByteArray arr (off + 2) (0xBD :: Word8) return (off + 3) - | codepoint < 0x10000 = construct $ \arr off -> do + | codepoint < 0x10000 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint)) writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint)) return (off + 3) - | otherwise = construct $ \arr off -> do + | otherwise = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint)) writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint)) @@ -418,7 +372,7 @@ char c -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder 8 -word64BE w = construct $ \arr off -> do +word64BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56)) writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48)) writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40)) @@ -432,7 +386,7 @@ word64BE w = construct $ \arr off -> do -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder 4 -word32BE w = construct $ \arr off -> do +word32BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24)) writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16)) writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8)) @@ -442,13 +396,13 @@ word32BE w = construct $ \arr off -> do -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder 2 -word16BE w = construct $ \arr off -> do +word16BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8)) writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 w) pure (off + 2) word8 :: Word8 -> Builder 1 -word8 w = construct $ \arr off -> do +word8 w = Unsafe.construct $ \arr off -> do writeByteArray arr off w pure (off + 1) @@ -469,9 +423,6 @@ reverseBytes arr begin end = go begin end where c2w :: Char -> Word8 c2w = fromIntegral . ord -unST :: ST s a -> State# s -> (# State# s, a #) -unST (ST f) = f - shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s () shrinkMutableByteArray (MutableByteArray arr) (I# sz) = primitive_ (shrinkMutableByteArray# arr sz) diff --git a/src/Data/ByteArray/Builder/Bounded/Unsafe.hs b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs new file mode 100644 index 0000000..9bad04f --- /dev/null +++ b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs @@ -0,0 +1,59 @@ +{-# language DataKinds #-} +{-# language GADTSyntax #-} +{-# language KindSignatures #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Bounded.Unsafe + ( -- * Types + Builder(..) + -- * Construct + , construct + -- * Run + , pasteST + , pasteIO + ) where + +import GHC.TypeLits (Nat) +import Data.Kind (Type) +import GHC.IO (stToIO) +import GHC.ST (ST(ST)) +import GHC.Exts (Int(I#),RealWorld,Int#,State#,MutableByteArray#) +import Data.Primitive (MutableByteArray(..)) + +-- | A builder parameterized by the maximum number of bytes it uses +-- when executed. +newtype Builder :: Nat -> Type where + Builder :: + (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) + -> Builder n + +-- | Constructor for 'Builder' that works on a function with lifted +-- arguments instead of unlifted ones. This is just as unsafe as the +-- actual constructor. +construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n +{-# inline construct #-} +construct f = Builder + $ \arr off s0 -> + case unST (f (MutableByteArray arr) (I# off)) s0 of + (# s1, (I# n) #) -> (# s1, n #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int +{-# inline pasteST #-} +pasteST (Builder f) (MutableByteArray arr) (I# off) = + ST $ \s0 -> case f arr off s0 of + (# s1, r #) -> (# s1, (I# r) #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int +{-# inline pasteIO #-} +pasteIO b m off = stToIO (pasteST b m off) + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/src/Data/ByteArray/Builder/Unsafe.hs b/src/Data/ByteArray/Builder/Unsafe.hs new file mode 100644 index 0000000..18c2eba --- /dev/null +++ b/src/Data/ByteArray/Builder/Unsafe.hs @@ -0,0 +1,94 @@ +{-# language BangPatterns #-} +{-# language DuplicateRecordFields #-} +{-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Unsafe + ( -- * Types + Builder(..) + -- * Safe Functions + -- | These functions are actually completely safe, but they are defined + -- here because they are used by typeclass instances. Import them from + -- @Data.ByteArray.Builder@ instead. + , stringUtf8 + , cstring + ) where + +import Data.Primitive (MutableByteArray(MutableByteArray)) +import Foreign.C.String (CString) +import GHC.Exts ((-#),(+#),(/=#),(>#)) +import GHC.Exts (Addr#,Int(I#),Ptr(Ptr)) +import GHC.Exts (IsString,Int#,State#,MutableByteArray#) +import GHC.ST (ST(ST)) +import GHC.Base (unpackCString#,unpackCStringUtf8#) + +import qualified GHC.Exts as Exts +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded + +-- | An unmaterialized sequence of bytes that may be pasted +-- into a mutable byte array. +newtype Builder = Builder + -- This functions takes an offset and a number of remaining bytes + -- and returns the new offset. + (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) + +instance IsString Builder where + {-# inline fromString #-} + fromString = stringUtf8 + +instance Semigroup Builder where + {-# inline (<>) #-} + Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of + (# s1, r #) -> case r /=# (-1#) of + 1# -> g arr r (len0 +# (off0 -# r)) s1 + _ -> (# s1, (-1#) #) + +instance Monoid Builder where + {-# inline mempty #-} + mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) + +-- | Create a builder from a cons-list of 'Char'. These +-- are be UTF-8 encoded. +stringUtf8 :: String -> Builder +{-# inline stringUtf8 #-} +stringUtf8 cs = Builder (\arr off0 len0 s0 -> goString cs arr off0 len0 s0) + +-- | Create a builder from a @NUL@-terminated 'CString'. This ignores any +-- textual encoding, copying bytes until @NUL@ is reached. +cstring :: CString -> Builder +{-# inline cstring #-} +cstring (Ptr cs) = Builder (\arr off0 len0 s0 -> goCString cs arr off0 len0 s0) + +goString :: String -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +{-# noinline goString #-} +goString [] _ off0 _ s0 = (# s0, off0 #) +goString (c : cs) buf off0 len0 s0 = case len0 ># 3# of + 1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf) (I# off0)) s0 of + (# s1, I# off1 #) -> goString cs buf off1 (len0 -# (off1 -# off0)) s1 + _ -> (# s0, (-1#) #) + +-- We have to have a rule for both unpackCString# and unpackCStringUtf8# +-- since GHC uses a different function based on whether or not non-ASCII +-- codepoints are used in the string. +{-# RULES +"Builder stringUtf8/cstring" forall s a b c d. + goString (unpackCString# s) a b c d = goCString s a b c d +"Builder stringUtf8/cstring-utf8" forall s a b c d. + goString (unpackCStringUtf8# s) a b c d = goCString s a b c d +#-} + +goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +goCString addr buf off0 len0 s0 = case Exts.indexWord8OffAddr# addr 0# of + 0## -> (# s0, off0 #) + w -> case len0 of + 0# -> (# s0, (-1#) #) + _ -> case Exts.writeWord8Array# buf off0 w s0 of + s1 -> goCString (Exts.plusAddr# addr 1# ) buf (off0 +# 1# ) (len0 -# 1# ) s1 + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/test/Main.hs b/test/Main.hs index 649711e..37b2cb0 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -5,7 +5,7 @@ import Control.Monad.ST (runST) import Data.Bytes.Types (MutableBytes(..)) -import Data.ByteArray.Builder.Small +import Data.ByteArray.Builder import Data.Word import Data.Char (ord) import Data.Primitive (ByteArray) @@ -14,15 +14,19 @@ import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===)) import Text.Printf (printf) import Test.Tasty.HUnit ((@=?)) + +import qualified Data.ByteString as ByteString import qualified Data.ByteString.Builder as BB -import qualified Data.Primitive as PM -import qualified Data.List as L -import qualified Data.Vector as V -import qualified Test.Tasty.QuickCheck as TQC -import qualified Test.QuickCheck as QC -import qualified GHC.Exts as Exts import qualified Data.ByteString.Lazy.Char8 as LB +import qualified Data.List as L +import qualified Data.Primitive as PM +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Vector as V +import qualified GHC.Exts as Exts +import qualified Test.QuickCheck as QC import qualified Test.Tasty.HUnit as THU +import qualified Test.Tasty.QuickCheck as TQC import qualified HexWord64 @@ -54,6 +58,9 @@ tests = testGroup "Tests" (runArray word64Dec (V.fromList xs)) === pack (foldMap show xs) + , THU.testCase "stringUtf8" $ + packUtf8 "¿Cómo estás? I am doing well." @=? + run 1 (stringUtf8 "¿Cómo estás? I am doing well.") , THU.testCase "doubleDec-A" $ pack (show (2 :: Int)) @=? run 1 (doubleDec 2.0) , THU.testCase "doubleDec-B" $ @@ -88,8 +95,8 @@ tests = testGroup "Tests" , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y -> run 1 - ( fromUnsafe (HexWord64.word64PaddedUpperHex x) - <> fromUnsafe (HexWord64.word64PaddedUpperHex y) + ( fromBounded (HexWord64.word64PaddedUpperHex x) + <> fromBounded (HexWord64.word64PaddedUpperHex y) ) === pack (showWord64PaddedUpperHex x <> showWord64PaddedUpperHex y) @@ -99,6 +106,9 @@ tests = testGroup "Tests" pack :: String -> ByteArray pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) +packUtf8 :: String -> ByteArray +packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack + -- This is used to test pasteArrayST runArray :: (a -> Builder) -- ^ Builder From a5bbf88e71c8506c64a7d135e1b8bd3c7152e9ed Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 3 Sep 2019 14:58:03 -0400 Subject: [PATCH 4/4] Avoid KnownNat constraints in favor of using Arithmetic.Types.Nat --- bench/Main.hs | 10 ++-- small-bytearray-builder.cabal | 14 +++--- src/Data/ByteArray/Builder.hs | 72 +++++++++++++++------------ src/Data/ByteArray/Builder/Bounded.hs | 57 ++++++++++----------- test/Main.hs | 7 ++- 5 files changed, 85 insertions(+), 75 deletions(-) diff --git a/bench/Main.hs b/bench/Main.hs index 28ef227..65872c4 100644 --- a/bench/Main.hs +++ b/bench/Main.hs @@ -1,7 +1,9 @@ +import Data.Primitive (ByteArray) +import Data.Word (Word64) import Gauge (bgroup,bench,whnf) import Gauge.Main (defaultMain) -import Data.Word (Word64) -import Data.Primitive (ByteArray) + +import qualified Arithmetic.Nat as Nat import qualified Data.ByteArray.Builder.Bounded as U import qualified HexWord64 @@ -33,7 +35,7 @@ data Word64s = Word64s encodeHexWord64s :: Word64s -> ByteArray {-# noinline encodeHexWord64s #-} -encodeHexWord64s (Word64s a b c d e f g h) = U.run $ +encodeHexWord64s (Word64s a b c d e f g h) = U.run Nat.constant $ U.word64PaddedUpperHex a `U.append` U.word64PaddedUpperHex b `U.append` U.word64PaddedUpperHex c `U.append` @@ -45,7 +47,7 @@ encodeHexWord64s (Word64s a b c d e f g h) = U.run $ encodeHexWord64sLoop :: Word64s -> ByteArray {-# noinline encodeHexWord64sLoop #-} -encodeHexWord64sLoop (Word64s a b c d e f g h) = U.run $ +encodeHexWord64sLoop (Word64s a b c d e f g h) = U.run Nat.constant $ HexWord64.word64PaddedUpperHex a `U.append` HexWord64.word64PaddedUpperHex b `U.append` HexWord64.word64PaddedUpperHex c `U.append` diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 9d613bd..6373cf7 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -66,16 +66,17 @@ test-suite test other-modules: HexWord64 build-depends: + , QuickCheck >=2.13.1 && <2.14 , base >=4.12.0.0 && <5 , byteslice , bytestring - , small-bytearray-builder - , QuickCheck >=2.13.1 && <2.14 - , tasty-quickcheck >=0.10.1 && <0.11 - , tasty-hunit >=0.10.0.2 && <0.11 - , tasty >=1.2.3 && <1.3 - , text >=1.2 && <1.3 + , natural-arithmetic , primitive + , small-bytearray-builder + , tasty >=1.2.3 && <1.3 + , tasty-hunit >=0.10.0.2 && <0.11 + , tasty-quickcheck >=0.10.1 && <0.11 + , text >=1.2 && <1.3 , vector benchmark bench @@ -83,6 +84,7 @@ benchmark bench build-depends: , base , gauge >= 0.2.4 + , natural-arithmetic , primitive , small-bytearray-builder ghc-options: -Wall -O2 diff --git a/src/Data/ByteArray/Builder.hs b/src/Data/ByteArray/Builder.hs index ce49b26..9302292 100644 --- a/src/Data/ByteArray/Builder.hs +++ b/src/Data/ByteArray/Builder.hs @@ -51,20 +51,21 @@ module Data.ByteArray.Builder import Control.Monad.Primitive import Control.Monad.ST import Control.Monad.ST.Run (runByteArrayST) -import Data.Bytes.Types -import Data.Primitive -import Data.Int (Int64) -import GHC.Exts -import GHC.ST -import GHC.Word -import GHC.TypeLits (KnownNat,natVal') -import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import Data.ByteString.Short.Internal (ShortByteString(SBS)) -import Data.Text.Short (ShortText) -import Data.Char (ord) import Data.ByteArray.Builder.Unsafe (Builder(Builder)) import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) +import Data.ByteString.Short.Internal (ShortByteString(SBS)) +import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) +import Data.Char (ord) +import Data.Int (Int64) +import Data.Primitive +import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) +import Data.Text.Short (ShortText) +import GHC.Exts +import GHC.ST (ST(ST)) +import GHC.Word +import qualified Arithmetic.Nat as Nat +import qualified Arithmetic.Types as Arithmetic import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM @@ -171,14 +172,19 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) --- | Convert a bounded builder to an unbounded one. -fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder +-- | Convert a bounded builder to an unbounded one. If the size +-- is a constant, use @Arithmetic.Nat.constant@ as the first argument +-- to let GHC conjure up this value for you. +fromBounded :: + Arithmetic.Nat n + -> Bounded.Builder n + -> Builder {-# inline fromBounded #-} -fromBounded (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> - case fromIntegral (natVal' (proxy# :: Proxy# n)) of - I# req -> case len >=# req of - 1# -> f arr off s0 - _ -> (# s0, (-1#) #) +fromBounded n (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> + let !(I# req) = Nat.demote n in + case len >=# req of + 1# -> f arr off s0 + _ -> (# s0, (-1#) #) -- | Create a builder from an unsliced byte sequence. bytearray :: ByteArray -> Builder @@ -251,70 +257,70 @@ shortTextJsonString a = -- This encoding never starts with a zero unless the -- argument was zero. word64Dec :: Word64 -> Builder -word64Dec w = fromBounded (Bounded.word64Dec w) +word64Dec w = fromBounded Nat.constant (Bounded.word64Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word32Dec :: Word32 -> Builder -word32Dec w = fromBounded (Bounded.word32Dec w) +word32Dec w = fromBounded Nat.constant (Bounded.word32Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word16Dec :: Word16 -> Builder -word16Dec w = fromBounded (Bounded.word16Dec w) +word16Dec w = fromBounded Nat.constant (Bounded.word16Dec w) -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- crash, but the generated numbers will be nonsense. doubleDec :: Double -> Builder -doubleDec w = fromBounded (Bounded.doubleDec w) +doubleDec w = fromBounded Nat.constant (Bounded.doubleDec w) -- | Encodes a signed 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. -- Negative numbers are preceded by a minus sign. Positive numbers -- are not preceded by anything. int64Dec :: Int64 -> Builder -int64Dec w = fromBounded (Bounded.int64Dec w) +int64Dec w = fromBounded Nat.constant (Bounded.int64Dec w) -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 16 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @00000000000003FE@. word64PaddedUpperHex :: Word64 -> Builder word64PaddedUpperHex w = - fromBounded (Bounded.word64PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word64PaddedUpperHex w) -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 8 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @000003FE@. word32PaddedUpperHex :: Word32 -> Builder word32PaddedUpperHex w = - fromBounded (Bounded.word32PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word32PaddedUpperHex w) -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 4 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @03FE@. word16PaddedUpperHex :: Word16 -> Builder word16PaddedUpperHex w = - fromBounded (Bounded.word16PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word16PaddedUpperHex w) -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 2 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 11 as @0B@. word8PaddedUpperHex :: Word8 -> Builder word8PaddedUpperHex w = - fromBounded (Bounded.word8PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word8PaddedUpperHex w) -- | Encode an ASCII char. -- Precondition: Input must be an ASCII character. This is not checked. ascii :: Char -> Builder -ascii c = fromBounded (Bounded.char c) +ascii c = fromBounded Nat.constant (Bounded.char c) -- | Encode an UTF8 char. This only uses as much space as is required. char :: Char -> Builder -char c = fromBounded (Bounded.char c) +char c = fromBounded Nat.constant (Bounded.char c) unST :: ST s a -> State# s -> (# State# s, a #) unST (ST f) = f @@ -326,20 +332,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) = -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder -word64BE w = fromBounded (Bounded.word64BE w) +word64BE w = fromBounded Nat.constant (Bounded.word64BE w) -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder -word32BE w = fromBounded (Bounded.word32BE w) +word32BE w = fromBounded Nat.constant (Bounded.word32BE w) -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder -word16BE w = fromBounded (Bounded.word16BE w) +word16BE w = fromBounded Nat.constant (Bounded.word16BE w) word8 :: Word8 -> Builder -word8 w = fromBounded (Bounded.word8 w) +word8 w = fromBounded Nat.constant (Bounded.word8 w) -- ShortText is already UTF-8 encoded. This is a no-op. shortTextToByteArray :: ShortText -> ByteArray diff --git a/src/Data/ByteArray/Builder/Bounded.hs b/src/Data/ByteArray/Builder/Bounded.hs index 5db8a12..d4bcb52 100644 --- a/src/Data/ByteArray/Builder/Bounded.hs +++ b/src/Data/ByteArray/Builder/Bounded.hs @@ -1,14 +1,14 @@ -{-# language KindSignatures #-} -{-# language ScopedTypeVariables #-} {-# language BangPatterns #-} -{-# language MagicHash #-} {-# language BinaryLiterals #-} -{-# language UnboxedTuples #-} -{-# language RankNTypes #-} -{-# language LambdaCase #-} -{-# language TypeOperators #-} {-# language DataKinds #-} +{-# language KindSignatures #-} +{-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} {-# language TypeApplications #-} +{-# language TypeOperators #-} +{-# language UnboxedTuples #-} -- | The functions in this module are explict in the amount of bytes they require. module Data.ByteArray.Builder.Bounded @@ -44,35 +44,35 @@ module Data.ByteArray.Builder.Bounded , doubleDec ) where +import Arithmetic.Types (type (<=), type (:=:)) import Control.Monad.Primitive import Control.Monad.ST import Data.Bits +import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..)) import Data.Char (ord) import Data.Primitive -import GHC.Exts -import GHC.ST (ST(ST)) -import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) -import GHC.Int (Int64(I64#)) -import GHC.TypeLits (KnownNat,type (+),natVal') import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import Arithmetic.Types (type (<=), type (:=:)) -import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..)) +import GHC.Exts +import GHC.Int (Int64(I64#)) +import GHC.ST (ST(ST)) +import GHC.TypeLits (type (+)) +import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) +import qualified Arithmetic.Types as Arithmetic +import qualified Arithmetic.Nat as Nat import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe import qualified Data.Primitive as PM --- Used internally. -knownNat :: KnownNat n => Proxy# n -> Int -{-# inline knownNat #-} -knownNat p = fromIntegral (natVal' p) - --- | Execute the bounded builder. -run :: forall n. KnownNat n - => Builder n -- ^ Builder +-- | Execute the bounded builder. If the size is a constant, +-- use @Arithmetic.Nat.constant@ as the first argument to let +-- GHC conjure up this value for you. +run :: + Arithmetic.Nat n + -> Builder n -- ^ Builder -> ByteArray {-# inline run #-} -run b = runST $ do - arr <- newByteArray (knownNat (proxy# :: Proxy# n)) +run n b = runST $ do + arr <- newByteArray (Nat.demote n) len <- Unsafe.pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr @@ -80,16 +80,17 @@ run b = runST $ do -- | Paste the builder into the byte array starting at offset zero. -- This reallocates the byte array if it cannot accomodate the builder, -- growing it by the minimum amount necessary. -pasteGrowST :: forall n s. KnownNat n - => Builder n +pasteGrowST :: + Arithmetic.Nat n + -> Builder n -> MutableByteArrayOffset s -- ^ Initial buffer, used linearly. Do not reuse this argument. -> ST s (MutableByteArrayOffset s) -- ^ Final buffer that accomodated the builder. {-# inline pasteGrowST #-} -pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do +pasteGrowST n b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do sz0 <- PM.getSizeofMutableByteArray arr0 - let req = knownNat (proxy# :: Proxy# n) + let req = Nat.demote n let sz1 = off0 + req if sz1 <= sz0 then do diff --git a/test/Main.hs b/test/Main.hs index 37b2cb0..6a2e6f8 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -9,12 +9,12 @@ import Data.ByteArray.Builder import Data.Word import Data.Char (ord) import Data.Primitive (ByteArray) -import Debug.Trace import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===)) import Text.Printf (printf) import Test.Tasty.HUnit ((@=?)) +import qualified Arithmetic.Nat as Nat import qualified Data.ByteString as ByteString import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Lazy.Char8 as LB @@ -24,7 +24,6 @@ import qualified Data.Text as T import qualified Data.Text.Encoding as TE import qualified Data.Vector as V import qualified GHC.Exts as Exts -import qualified Test.QuickCheck as QC import qualified Test.Tasty.HUnit as THU import qualified Test.Tasty.QuickCheck as TQC @@ -95,8 +94,8 @@ tests = testGroup "Tests" , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y -> run 1 - ( fromBounded (HexWord64.word64PaddedUpperHex x) - <> fromBounded (HexWord64.word64PaddedUpperHex y) + ( fromBounded Nat.constant (HexWord64.word64PaddedUpperHex x) + <> fromBounded Nat.constant (HexWord64.word64PaddedUpperHex y) ) === pack (showWord64PaddedUpperHex x <> showWord64PaddedUpperHex y)