diff --git a/bench/Main.hs b/bench/Main.hs index a77a65e..65872c4 100644 --- a/bench/Main.hs +++ b/bench/Main.hs @@ -1,8 +1,10 @@ +import Data.Primitive (ByteArray) +import Data.Word (Word64) import Gauge (bgroup,bench,whnf) import Gauge.Main (defaultMain) -import Data.Word (Word64) -import Data.Primitive (ByteArray) -import qualified Data.ByteArray.Builder.Small.Unsafe as U + +import qualified Arithmetic.Nat as Nat +import qualified Data.ByteArray.Builder.Bounded as U import qualified HexWord64 @@ -33,7 +35,7 @@ data Word64s = Word64s encodeHexWord64s :: Word64s -> ByteArray {-# noinline encodeHexWord64s #-} -encodeHexWord64s (Word64s a b c d e f g h) = U.run $ +encodeHexWord64s (Word64s a b c d e f g h) = U.run Nat.constant $ U.word64PaddedUpperHex a `U.append` U.word64PaddedUpperHex b `U.append` U.word64PaddedUpperHex c `U.append` @@ -45,7 +47,7 @@ encodeHexWord64s (Word64s a b c d e f g h) = U.run $ encodeHexWord64sLoop :: Word64s -> ByteArray {-# noinline encodeHexWord64sLoop #-} -encodeHexWord64sLoop (Word64s a b c d e f g h) = U.run $ +encodeHexWord64sLoop (Word64s a b c d e f g h) = U.run Nat.constant $ HexWord64.word64PaddedUpperHex a `U.append` HexWord64.word64PaddedUpperHex b `U.append` HexWord64.word64PaddedUpperHex c `U.append` diff --git a/cabal.project b/cabal.project new file mode 100644 index 0000000..079db62 --- /dev/null +++ b/cabal.project @@ -0,0 +1,5 @@ +packages: . +source-repository-package + type: git + location: https://github.com/andrewthad/natural-arithmetic + tag: 68868c96b58ddaf71bb865b247d2c14c3668f4c2 diff --git a/common/HexWord64.hs b/common/HexWord64.hs index 68ea963..65c5dfa 100644 --- a/common/HexWord64.hs +++ b/common/HexWord64.hs @@ -17,7 +17,7 @@ module HexWord64 import GHC.ST (ST(ST)) import Data.Bits -import Data.ByteArray.Builder.Small.Unsafe (Builder,construct) +import Data.ByteArray.Builder.Bounded.Unsafe (Builder,construct) import Data.Primitive import Data.Word import GHC.Exts diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index 8fe031a..6373cf7 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -36,8 +36,10 @@ flag checked library exposed-modules: - Data.ByteArray.Builder.Small - Data.ByteArray.Builder.Small.Unsafe + Data.ByteArray.Builder + Data.ByteArray.Builder.Unsafe + Data.ByteArray.Builder.Bounded + Data.ByteArray.Builder.Bounded.Unsafe build-depends: , base >=4.12.0.0 && <5 , byteslice >=0.1 && <0.2 @@ -46,6 +48,7 @@ library , vector >=0.12.0.3 && <0.13 , bytestring >=0.10.8.2 && <0.11 , text-short >=0.1.3 && <0.2 + , natural-arithmetic >=0.1 && <0.2 if flag(checked) build-depends: primitive-checked >= 0.7 && <0.8 else @@ -59,18 +62,21 @@ test-suite test type: exitcode-stdio-1.0 hs-source-dirs: test, common main-is: Main.hs + ghc-options: -O2 -Wall other-modules: HexWord64 build-depends: + , QuickCheck >=2.13.1 && <2.14 , base >=4.12.0.0 && <5 , byteslice , bytestring - , small-bytearray-builder - , QuickCheck >=2.13.1 && <2.14 - , tasty-quickcheck >=0.10.1 && <0.11 - , tasty-hunit >=0.10.0.2 && <0.11 - , tasty >=1.2.3 && <1.3 + , natural-arithmetic , primitive + , small-bytearray-builder + , tasty >=1.2.3 && <1.3 + , tasty-hunit >=0.10.0.2 && <0.11 + , tasty-quickcheck >=0.10.1 && <0.11 + , text >=1.2 && <1.3 , vector benchmark bench @@ -78,6 +84,7 @@ benchmark bench build-depends: , base , gauge >= 0.2.4 + , natural-arithmetic , primitive , small-bytearray-builder ghc-options: -Wall -O2 diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder.hs similarity index 82% rename from src/Data/ByteArray/Builder/Small.hs rename to src/Data/ByteArray/Builder.hs index f2f0e54..9302292 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder.hs @@ -6,11 +6,11 @@ {-# language ScopedTypeVariables #-} {-# language UnboxedTuples #-} -module Data.ByteArray.Builder.Small - ( -- * Unsafe Primitives +module Data.ByteArray.Builder + ( -- * Bounded Primitives Builder(..) , construct - , fromUnsafe + , fromBounded -- * Evaluation , run , pasteST @@ -24,6 +24,8 @@ module Data.ByteArray.Builder.Small , bytearray , shortTextUtf8 , shortTextJsonString + , cstring + , stringUtf8 -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -34,10 +36,13 @@ module Data.ByteArray.Builder.Small , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE , word16BE + , word8 -- * Encode Floating-Point Types -- ** Human-Readable , doubleDec @@ -46,43 +51,27 @@ module Data.ByteArray.Builder.Small import Control.Monad.Primitive import Control.Monad.ST import Control.Monad.ST.Run (runByteArrayST) -import Data.Bytes.Types -import Data.Primitive -import Data.Int (Int64) -import GHC.Exts -import GHC.ST -import GHC.Word -import GHC.TypeLits (KnownNat,natVal') -import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) +import Data.ByteArray.Builder.Unsafe (Builder(Builder)) +import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import Data.ByteString.Short.Internal (ShortByteString(SBS)) -import Data.Text.Short (ShortText) +import Data.Bytes.Types (Bytes(Bytes),MutableBytes(MutableBytes)) import Data.Char (ord) +import Data.Int (Int64) +import Data.Primitive +import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) +import Data.Text.Short (ShortText) +import GHC.Exts +import GHC.ST (ST(ST)) +import GHC.Word +import qualified Arithmetic.Nat as Nat +import qualified Arithmetic.Types as Arithmetic import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V -import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe - --- | An unmaterialized sequence of bytes that may be pasted --- into a mutable byte array. -newtype Builder = Builder - -- This functions takes an offset and a number of remaining bytes - -- and returns the new offset. - (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) - -instance Semigroup Builder where - {-# inline (<>) #-} - Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of - (# s1, r #) -> case r /=# (-1#) of - 1# -> g arr r (len0 +# (off0 -# r)) s1 - _ -> (# s1, (-1#) #) - -instance Monoid Builder where - mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) - -instance IsString Builder where - fromString = shortTextUtf8 . TS.fromString +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded -- | Run a builder. An accurate size hint is important for good performance. -- The size hint should be slightly larger than the actual size. @@ -98,7 +87,7 @@ run hint b = runByteArrayST $ do Just len -> do shrinkMutableByteArray arr len unsafeFreezeByteArray arr - go hint + go (max hint 1) -- | Variant of 'pasteArrayST' that runs in 'IO'. pasteArrayIO :: @@ -183,13 +172,19 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) -fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder -{-# inline fromUnsafe #-} -fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 -> - case fromIntegral (natVal' (proxy# :: Proxy# n)) of - I# req -> case len >=# req of - 1# -> f arr off s0 - _ -> (# s0, (-1#) #) +-- | Convert a bounded builder to an unbounded one. If the size +-- is a constant, use @Arithmetic.Nat.constant@ as the first argument +-- to let GHC conjure up this value for you. +fromBounded :: + Arithmetic.Nat n + -> Bounded.Builder n + -> Builder +{-# inline fromBounded #-} +fromBounded n (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> + let !(I# req) = Nat.demote n in + case len >=# req of + 1# -> f arr off s0 + _ -> (# s0, (-1#) #) -- | Create a builder from an unsliced byte sequence. bytearray :: ByteArray -> Builder @@ -223,8 +218,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) else do write2 dst doff '\\' 'u' - doff' <- Unsafe.pasteST - (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c))) + doff' <- UnsafeBounded.pasteST + (Bounded.word16PaddedUpperHex (fromIntegral (c2w c))) dst (doff + 2) go (soff + 1) (slen - 1) doff' else pure doff @@ -262,61 +257,70 @@ shortTextJsonString a = -- This encoding never starts with a zero unless the -- argument was zero. word64Dec :: Word64 -> Builder -word64Dec w = fromUnsafe (Unsafe.word64Dec w) +word64Dec w = fromBounded Nat.constant (Bounded.word64Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word32Dec :: Word32 -> Builder -word32Dec w = fromUnsafe (Unsafe.word32Dec w) +word32Dec w = fromBounded Nat.constant (Bounded.word32Dec w) -- | Encodes an unsigned 16-bit integer as decimal. -- This encoding never starts with a zero unless the -- argument was zero. word16Dec :: Word16 -> Builder -word16Dec w = fromUnsafe (Unsafe.word16Dec w) +word16Dec w = fromBounded Nat.constant (Bounded.word16Dec w) -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not -- crash, but the generated numbers will be nonsense. doubleDec :: Double -> Builder -doubleDec w = fromUnsafe (Unsafe.doubleDec w) +doubleDec w = fromBounded Nat.constant (Bounded.doubleDec w) -- | Encodes a signed 64-bit integer as decimal. -- This encoding never starts with a zero unless the argument was zero. -- Negative numbers are preceded by a minus sign. Positive numbers -- are not preceded by anything. int64Dec :: Int64 -> Builder -int64Dec w = fromUnsafe (Unsafe.int64Dec w) +int64Dec w = fromBounded Nat.constant (Bounded.int64Dec w) -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 16 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @00000000000003FE@. word64PaddedUpperHex :: Word64 -> Builder word64PaddedUpperHex w = - fromUnsafe (Unsafe.word64PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word64PaddedUpperHex w) -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 8 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @000003FE@. word32PaddedUpperHex :: Word32 -> Builder word32PaddedUpperHex w = - fromUnsafe (Unsafe.word32PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word32PaddedUpperHex w) -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 4 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 1022 as @03FE@. word16PaddedUpperHex :: Word16 -> Builder word16PaddedUpperHex w = - fromUnsafe (Unsafe.word16PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word16PaddedUpperHex w) -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding -- the encoding to 2 digits. This uses uppercase for the alphabetical -- digits. For example, this encodes the number 11 as @0B@. word8PaddedUpperHex :: Word8 -> Builder word8PaddedUpperHex w = - fromUnsafe (Unsafe.word8PaddedUpperHex w) + fromBounded Nat.constant (Bounded.word8PaddedUpperHex w) + +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder +ascii c = fromBounded Nat.constant (Bounded.char c) + +-- | Encode an UTF8 char. This only uses as much space as is required. +char :: Char -> Builder +char c = fromBounded Nat.constant (Bounded.char c) unST :: ST s a -> State# s -> (# State# s, a #) unST (ST f) = f @@ -328,17 +332,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) = -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder -word64BE w = fromUnsafe (Unsafe.word64BE w) +word64BE w = fromBounded Nat.constant (Bounded.word64BE w) -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder -word32BE w = fromUnsafe (Unsafe.word32BE w) +word32BE w = fromBounded Nat.constant (Bounded.word32BE w) -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder -word16BE w = fromUnsafe (Unsafe.word16BE w) +word16BE w = fromBounded Nat.constant (Bounded.word16BE w) + +word8 :: Word8 -> Builder +word8 w = fromBounded Nat.constant (Bounded.word8 w) -- ShortText is already UTF-8 encoded. This is a no-op. shortTextToByteArray :: ShortText -> ByteArray @@ -349,4 +356,4 @@ indexChar8Array :: ByteArray -> Int -> Char indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) c2w :: Char -> Word8 -c2w = fromIntegral . ord +c2w = fromIntegral . ord diff --git a/src/Data/ByteArray/Builder/Small/Unsafe.hs b/src/Data/ByteArray/Builder/Bounded.hs similarity index 73% rename from src/Data/ByteArray/Builder/Small/Unsafe.hs rename to src/Data/ByteArray/Builder/Bounded.hs index 1ae6695..d4bcb52 100644 --- a/src/Data/ByteArray/Builder/Small/Unsafe.hs +++ b/src/Data/ByteArray/Builder/Bounded.hs @@ -1,28 +1,28 @@ -{-# language GADTSyntax #-} -{-# language KindSignatures #-} -{-# language ScopedTypeVariables #-} {-# language BangPatterns #-} -{-# language MagicHash #-} -{-# language UnboxedTuples #-} -{-# language RankNTypes #-} -{-# language LambdaCase #-} -{-# language TypeOperators #-} +{-# language BinaryLiterals #-} {-# language DataKinds #-} +{-# language KindSignatures #-} +{-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} {-# language TypeApplications #-} +{-# language TypeOperators #-} +{-# language UnboxedTuples #-} --- | The functions in this module do not check to --- see if there is enough space in the buffer. -module Data.ByteArray.Builder.Small.Unsafe +-- | The functions in this module are explict in the amount of bytes they require. +module Data.ByteArray.Builder.Bounded ( -- * Builder - Builder(..) - , construct + Builder -- * Execute , run - , pasteST , pasteGrowST - , pasteIO -- * Combine + , empty , append + -- * Bounds Manipulation + , weaken + , substitute -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -33,6 +33,8 @@ module Data.ByteArray.Builder.Small.Unsafe , word32PaddedUpperHex , word16PaddedUpperHex , word8PaddedUpperHex + , ascii + , char -- ** Machine-Readable , word64BE , word32BE @@ -42,95 +44,91 @@ module Data.ByteArray.Builder.Small.Unsafe , doubleDec ) where +import Arithmetic.Types (type (<=), type (:=:)) import Control.Monad.Primitive import Control.Monad.ST import Data.Bits +import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..)) import Data.Char (ord) import Data.Primitive -import GHC.Exts -import GHC.ST -import GHC.Word -import GHC.Int -import Data.Kind -import GHC.TypeLits (KnownNat,Nat,type (+),natVal') import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import Control.Monad (when) +import GHC.Exts +import GHC.Int (Int64(I64#)) +import GHC.ST (ST(ST)) +import GHC.TypeLits (type (+)) +import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) +import qualified Arithmetic.Types as Arithmetic +import qualified Arithmetic.Nat as Nat +import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe import qualified Data.Primitive as PM --- | A builder parameterized by the maximum number of bytes it uses --- when executed. -newtype Builder :: Nat -> Type where - Builder :: - (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) - -> Builder n - --- | Execute the builder. This function is safe. -run :: forall n. KnownNat n - => Builder n -- ^ Builder +-- | Execute the bounded builder. If the size is a constant, +-- use @Arithmetic.Nat.constant@ as the first argument to let +-- GHC conjure up this value for you. +run :: + Arithmetic.Nat n + -> Builder n -- ^ Builder -> ByteArray {-# inline run #-} -run b = runST $ do - arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n))) - len <- pasteST b arr 0 +run n b = runST $ do + arr <- newByteArray (Nat.demote n) + len <- Unsafe.pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int -{-# inline pasteST #-} -pasteST (Builder f) (MutableByteArray arr) (I# off) = - ST $ \s0 -> case f arr off s0 of - (# s1, r #) -> (# s1, (I# r) #) - -- | Paste the builder into the byte array starting at offset zero. -- This reallocates the byte array if it cannot accomodate the builder, -- growing it by the minimum amount necessary. -pasteGrowST :: forall n s. KnownNat n - => Builder n +pasteGrowST :: + Arithmetic.Nat n + -> Builder n -> MutableByteArrayOffset s -- ^ Initial buffer, used linearly. Do not reuse this argument. -> ST s (MutableByteArrayOffset s) -- ^ Final buffer that accomodated the builder. {-# inline pasteGrowST #-} -pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do +pasteGrowST n b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do sz0 <- PM.getSizeofMutableByteArray arr0 - let req = fromIntegral (natVal' (proxy# :: Proxy# n)) + let req = Nat.demote n let sz1 = off0 + req if sz1 <= sz0 then do - off1 <- pasteST b arr0 off0 + off1 <- Unsafe.pasteST b arr0 off0 pure (MutableByteArrayOffset arr0 off1) else do arr1 <- PM.resizeMutableByteArray arr0 sz1 - off1 <- pasteST b arr1 off0 + off1 <- Unsafe.pasteST b arr1 off0 pure (MutableByteArrayOffset arr1 off1) --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int -{-# inline pasteIO #-} -pasteIO b m off = stToIO (pasteST b m off) - --- | Constructor for 'Builder' that works on a function with lifted --- arguments instead of unlifted ones. This is just as unsafe as the --- actual constructor. -construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n -{-# inline construct #-} -construct f = Builder - $ \arr off s0 -> - case unST (f (MutableByteArray arr) (I# off)) s0 of - (# s1, (I# n) #) -> (# s1, n #) +-- | The monoidal unit of `append` +empty :: Builder 0 +empty = Builder $ \_ off0 s0 -> (# s0, off0 #) infixr 9 `append` -- | Concatenate two builders. -append :: Builder n -> Builder m -> Builder (n + m) +append :: Builder m -> Builder n -> Builder (m + n) append (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 +-- | Weaken the bound on the maximum number of bytes required. For example, +-- to use two builders with unequal bounds in a disjunctive setting: +-- +-- > import qualified Arithmetic.Lte as Lte +-- > +-- > buildNumber :: Either Double Word64 -> Builder 32 +-- > buildNumber = \case +-- > Left d -> doubleDec d +-- > Right w -> weaken (Lte.constant @19 @32) (word64Dec w) +weaken :: forall m n. (m <= n) -> Builder m -> Builder n +weaken !_ (Builder f) = Builder f + +-- | Replace the upper bound on size with an equal number. +substitute :: forall m n. (m :=: n) -> Builder m -> Builder n +substitute !_ (Builder f) = Builder f + -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not @@ -164,7 +162,7 @@ int64Dec (I64# w) = int64Dec# w -- the word. This is only used internally. wordCommonDec# :: Word# -> Builder n {-# noinline wordCommonDec# #-} -wordCommonDec# w# = construct $ \arr off0 -> if w /= 0 +wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0 then internalWordLoop arr off0 (W# w#) else do writeByteArray arr off0 (c2w '0') @@ -187,7 +185,7 @@ internalWordLoop arr off0 x0 = go off0 x0 where -- | Requires up to 19 bytes. int64Dec# :: Int# -> Builder 20 {-# noinline int64Dec# #-} -int64Dec# w# = construct $ \arr off0 -> case compare w 0 of +int64Dec# w# = Unsafe.construct $ \arr off0 -> case compare w 0 of GT -> internalWordLoop arr off0 (fromIntegral w) EQ -> do writeByteArray arr off0 (c2w '0') @@ -243,7 +241,7 @@ word8PaddedUpperHex (W8# w) = word8PaddedUpperHex# w -- might not be. Benchmark this. word64PaddedUpperHex# :: Word# -> Builder 16 {-# noinline word64PaddedUpperHex# #-} -word64PaddedUpperHex# w# = construct $ \arr off -> do +word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 60)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52)) @@ -266,7 +264,7 @@ word64PaddedUpperHex# w# = construct $ \arr off -> do word32PaddedUpperHex# :: Word# -> Builder 8 {-# noinline word32PaddedUpperHex# #-} -word32PaddedUpperHex# w# = construct $ \arr off -> do +word32PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 28)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 24)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 20)) @@ -283,7 +281,7 @@ word32PaddedUpperHex# w# = construct $ \arr off -> do -- GHC make the decision. Open an issue on github if this is -- a problem. word16PaddedUpperHex# :: Word# -> Builder 4 -word16PaddedUpperHex# w# = construct $ \arr off -> do +word16PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 12)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 8)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 4)) @@ -295,17 +293,87 @@ word16PaddedUpperHex# w# = construct $ \arr off -> do -- Definitely want this to inline. It's maybe a dozen instructions total. word8PaddedUpperHex# :: Word# -> Builder 2 {-# inline word8PaddedUpperHex #-} -word8PaddedUpperHex# w# = construct $ \arr off -> do +word8PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 4)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 0)) pure (off + 2) where w = W# w# +-- | Encode an ASCII char. +-- Precondition: Input must be an ASCII character. This is not checked. +ascii :: Char -> Builder 1 +ascii c = word8 (fromIntegral @Int @Word8 (ord c)) + +-- | Encode a character as UTF-8. This only uses as much space as is required. +char :: Char -> Builder 4 +char c + | codepoint < 0x80 = Unsafe.construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 codepoint) + pure (off + 1) + | codepoint < 0x800 = Unsafe.construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint)) + return (off + 2) + | codepoint >= 0xD800 && codepoint < 0xE000 = Unsafe.construct $ \arr off -> do + -- Codepoint U+FFFD + writeByteArray arr off (0xEF :: Word8) + writeByteArray arr (off + 1) (0xBF :: Word8) + writeByteArray arr (off + 2) (0xBD :: Word8) + return (off + 3) + | codepoint < 0x10000 = Unsafe.construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint)) + return (off + 3) + | otherwise = Unsafe.construct $ \arr off -> do + writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint)) + writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint)) + writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint)) + writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint)) + return (off + 4) + + where + codepoint :: Word + codepoint = fromIntegral (ord c) + + unsafeWordToWord8 :: Word -> Word8 + unsafeWordToWord8 (W# w) = W8# w + + -- precondition: codepoint is less than 0x800 + byteTwoOne :: Word -> Word + byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000 + + byteTwoTwo :: Word -> Word + byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x1000 + byteThreeOne :: Word -> Word + byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000 + + byteThreeTwo :: Word -> Word + byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteThreeThree :: Word -> Word + byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000 + + -- precondition: codepoint is less than 0x110000 + byteFourOne :: Word -> Word + byteFourOne w = unsafeShiftR w 18 .|. 0b11110000 + + byteFourTwo :: Word -> Word + byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000 + + byteFourThree :: Word -> Word + byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000 + + byteFourFour :: Word -> Word + byteFourFour w = (0b00111111 .&. w) .|. 0b10000000 + -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder 8 -word64BE w = construct $ \arr off -> do +word64BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56)) writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48)) writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40)) @@ -319,7 +387,7 @@ word64BE w = construct $ \arr off -> do -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder 4 -word32BE w = construct $ \arr off -> do +word32BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24)) writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16)) writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8)) @@ -329,13 +397,13 @@ word32BE w = construct $ \arr off -> do -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder 2 -word16BE w = construct $ \arr off -> do +word16BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8)) writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 w) pure (off + 2) word8 :: Word8 -> Builder 1 -word8 w = construct $ \arr off -> do +word8 w = Unsafe.construct $ \arr off -> do writeByteArray arr off w pure (off + 1) @@ -356,9 +424,6 @@ reverseBytes arr begin end = go begin end where c2w :: Char -> Word8 c2w = fromIntegral . ord -unST :: ST s a -> State# s -> (# State# s, a #) -unST (ST f) = f - shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s () shrinkMutableByteArray (MutableByteArray arr) (I# sz) = primitive_ (shrinkMutableByteArray# arr sz) diff --git a/src/Data/ByteArray/Builder/Bounded/Unsafe.hs b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs new file mode 100644 index 0000000..9bad04f --- /dev/null +++ b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs @@ -0,0 +1,59 @@ +{-# language DataKinds #-} +{-# language GADTSyntax #-} +{-# language KindSignatures #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Bounded.Unsafe + ( -- * Types + Builder(..) + -- * Construct + , construct + -- * Run + , pasteST + , pasteIO + ) where + +import GHC.TypeLits (Nat) +import Data.Kind (Type) +import GHC.IO (stToIO) +import GHC.ST (ST(ST)) +import GHC.Exts (Int(I#),RealWorld,Int#,State#,MutableByteArray#) +import Data.Primitive (MutableByteArray(..)) + +-- | A builder parameterized by the maximum number of bytes it uses +-- when executed. +newtype Builder :: Nat -> Type where + Builder :: + (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) + -> Builder n + +-- | Constructor for 'Builder' that works on a function with lifted +-- arguments instead of unlifted ones. This is just as unsafe as the +-- actual constructor. +construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n +{-# inline construct #-} +construct f = Builder + $ \arr off s0 -> + case unST (f (MutableByteArray arr) (I# off)) s0 of + (# s1, (I# n) #) -> (# s1, n #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int +{-# inline pasteST #-} +pasteST (Builder f) (MutableByteArray arr) (I# off) = + ST $ \s0 -> case f arr off s0 of + (# s1, r #) -> (# s1, (I# r) #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int +{-# inline pasteIO #-} +pasteIO b m off = stToIO (pasteST b m off) + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/src/Data/ByteArray/Builder/Unsafe.hs b/src/Data/ByteArray/Builder/Unsafe.hs new file mode 100644 index 0000000..18c2eba --- /dev/null +++ b/src/Data/ByteArray/Builder/Unsafe.hs @@ -0,0 +1,94 @@ +{-# language BangPatterns #-} +{-# language DuplicateRecordFields #-} +{-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Unsafe + ( -- * Types + Builder(..) + -- * Safe Functions + -- | These functions are actually completely safe, but they are defined + -- here because they are used by typeclass instances. Import them from + -- @Data.ByteArray.Builder@ instead. + , stringUtf8 + , cstring + ) where + +import Data.Primitive (MutableByteArray(MutableByteArray)) +import Foreign.C.String (CString) +import GHC.Exts ((-#),(+#),(/=#),(>#)) +import GHC.Exts (Addr#,Int(I#),Ptr(Ptr)) +import GHC.Exts (IsString,Int#,State#,MutableByteArray#) +import GHC.ST (ST(ST)) +import GHC.Base (unpackCString#,unpackCStringUtf8#) + +import qualified GHC.Exts as Exts +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded + +-- | An unmaterialized sequence of bytes that may be pasted +-- into a mutable byte array. +newtype Builder = Builder + -- This functions takes an offset and a number of remaining bytes + -- and returns the new offset. + (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) + +instance IsString Builder where + {-# inline fromString #-} + fromString = stringUtf8 + +instance Semigroup Builder where + {-# inline (<>) #-} + Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of + (# s1, r #) -> case r /=# (-1#) of + 1# -> g arr r (len0 +# (off0 -# r)) s1 + _ -> (# s1, (-1#) #) + +instance Monoid Builder where + {-# inline mempty #-} + mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) + +-- | Create a builder from a cons-list of 'Char'. These +-- are be UTF-8 encoded. +stringUtf8 :: String -> Builder +{-# inline stringUtf8 #-} +stringUtf8 cs = Builder (\arr off0 len0 s0 -> goString cs arr off0 len0 s0) + +-- | Create a builder from a @NUL@-terminated 'CString'. This ignores any +-- textual encoding, copying bytes until @NUL@ is reached. +cstring :: CString -> Builder +{-# inline cstring #-} +cstring (Ptr cs) = Builder (\arr off0 len0 s0 -> goCString cs arr off0 len0 s0) + +goString :: String -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +{-# noinline goString #-} +goString [] _ off0 _ s0 = (# s0, off0 #) +goString (c : cs) buf off0 len0 s0 = case len0 ># 3# of + 1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf) (I# off0)) s0 of + (# s1, I# off1 #) -> goString cs buf off1 (len0 -# (off1 -# off0)) s1 + _ -> (# s0, (-1#) #) + +-- We have to have a rule for both unpackCString# and unpackCStringUtf8# +-- since GHC uses a different function based on whether or not non-ASCII +-- codepoints are used in the string. +{-# RULES +"Builder stringUtf8/cstring" forall s a b c d. + goString (unpackCString# s) a b c d = goCString s a b c d +"Builder stringUtf8/cstring-utf8" forall s a b c d. + goString (unpackCStringUtf8# s) a b c d = goCString s a b c d +#-} + +goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +goCString addr buf off0 len0 s0 = case Exts.indexWord8OffAddr# addr 0# of + 0## -> (# s0, off0 #) + w -> case len0 of + 0# -> (# s0, (-1#) #) + _ -> case Exts.writeWord8Array# buf off0 w s0 of + s1 -> goCString (Exts.plusAddr# addr 1# ) buf (off0 +# 1# ) (len0 -# 1# ) s1 + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/test/Main.hs b/test/Main.hs index 649711e..6a2e6f8 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -5,24 +5,27 @@ import Control.Monad.ST (runST) import Data.Bytes.Types (MutableBytes(..)) -import Data.ByteArray.Builder.Small +import Data.ByteArray.Builder import Data.Word import Data.Char (ord) import Data.Primitive (ByteArray) -import Debug.Trace import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===)) import Text.Printf (printf) import Test.Tasty.HUnit ((@=?)) + +import qualified Arithmetic.Nat as Nat +import qualified Data.ByteString as ByteString import qualified Data.ByteString.Builder as BB -import qualified Data.Primitive as PM -import qualified Data.List as L -import qualified Data.Vector as V -import qualified Test.Tasty.QuickCheck as TQC -import qualified Test.QuickCheck as QC -import qualified GHC.Exts as Exts import qualified Data.ByteString.Lazy.Char8 as LB +import qualified Data.List as L +import qualified Data.Primitive as PM +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Vector as V +import qualified GHC.Exts as Exts import qualified Test.Tasty.HUnit as THU +import qualified Test.Tasty.QuickCheck as TQC import qualified HexWord64 @@ -54,6 +57,9 @@ tests = testGroup "Tests" (runArray word64Dec (V.fromList xs)) === pack (foldMap show xs) + , THU.testCase "stringUtf8" $ + packUtf8 "¿Cómo estás? I am doing well." @=? + run 1 (stringUtf8 "¿Cómo estás? I am doing well.") , THU.testCase "doubleDec-A" $ pack (show (2 :: Int)) @=? run 1 (doubleDec 2.0) , THU.testCase "doubleDec-B" $ @@ -88,8 +94,8 @@ tests = testGroup "Tests" , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y -> run 1 - ( fromUnsafe (HexWord64.word64PaddedUpperHex x) - <> fromUnsafe (HexWord64.word64PaddedUpperHex y) + ( fromBounded Nat.constant (HexWord64.word64PaddedUpperHex x) + <> fromBounded Nat.constant (HexWord64.word64PaddedUpperHex y) ) === pack (showWord64PaddedUpperHex x <> showWord64PaddedUpperHex y) @@ -99,6 +105,9 @@ tests = testGroup "Tests" pack :: String -> ByteArray pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) +packUtf8 :: String -> ByteArray +packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack + -- This is used to test pasteArrayST runArray :: (a -> Builder) -- ^ Builder