From 2d1ea682617ad952f184c1f12baea4db564e1702 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 3 Sep 2019 14:43:58 -0400 Subject: [PATCH] Remove the word "Small" from module names. Move unsafe functions into modules suffixed with "Unsafe". Write a better IsString instance. Depend on natural-arithmetic. --- bench/Main.hs | 2 +- cabal.project | 5 + common/HexWord64.hs | 2 +- small-bytearray-builder.cabal | 9 +- .../{Builder/Small.hs => Builder.hs} | 36 ++--- .../ByteArray/Builder/{Small => }/Bounded.hs | 149 ++++++------------ src/Data/ByteArray/Builder/Bounded/Unsafe.hs | 59 +++++++ src/Data/ByteArray/Builder/Unsafe.hs | 94 +++++++++++ test/Main.hs | 28 ++-- 9 files changed, 247 insertions(+), 137 deletions(-) create mode 100644 cabal.project rename src/Data/ByteArray/{Builder/Small.hs => Builder.hs} (93%) rename src/Data/ByteArray/Builder/{Small => }/Bounded.hs (82%) create mode 100644 src/Data/ByteArray/Builder/Bounded/Unsafe.hs create mode 100644 src/Data/ByteArray/Builder/Unsafe.hs diff --git a/bench/Main.hs b/bench/Main.hs index a77a65e..28ef227 100644 --- a/bench/Main.hs +++ b/bench/Main.hs @@ -2,7 +2,7 @@ import Gauge (bgroup,bench,whnf) import Gauge.Main (defaultMain) import Data.Word (Word64) import Data.Primitive (ByteArray) -import qualified Data.ByteArray.Builder.Small.Unsafe as U +import qualified Data.ByteArray.Builder.Bounded as U import qualified HexWord64 diff --git a/cabal.project b/cabal.project new file mode 100644 index 0000000..079db62 --- /dev/null +++ b/cabal.project @@ -0,0 +1,5 @@ +packages: . +source-repository-package + type: git + location: https://github.com/andrewthad/natural-arithmetic + tag: 68868c96b58ddaf71bb865b247d2c14c3668f4c2 diff --git a/common/HexWord64.hs b/common/HexWord64.hs index 68ea963..65c5dfa 100644 --- a/common/HexWord64.hs +++ b/common/HexWord64.hs @@ -17,7 +17,7 @@ module HexWord64 import GHC.ST (ST(ST)) import Data.Bits -import Data.ByteArray.Builder.Small.Unsafe (Builder,construct) +import Data.ByteArray.Builder.Bounded.Unsafe (Builder,construct) import Data.Primitive import Data.Word import GHC.Exts diff --git a/small-bytearray-builder.cabal b/small-bytearray-builder.cabal index f5e511f..9d613bd 100644 --- a/small-bytearray-builder.cabal +++ b/small-bytearray-builder.cabal @@ -36,8 +36,10 @@ flag checked library exposed-modules: - Data.ByteArray.Builder.Small - Data.ByteArray.Builder.Small.Bounded + Data.ByteArray.Builder + Data.ByteArray.Builder.Unsafe + Data.ByteArray.Builder.Bounded + Data.ByteArray.Builder.Bounded.Unsafe build-depends: , base >=4.12.0.0 && <5 , byteslice >=0.1 && <0.2 @@ -46,6 +48,7 @@ library , vector >=0.12.0.3 && <0.13 , bytestring >=0.10.8.2 && <0.11 , text-short >=0.1.3 && <0.2 + , natural-arithmetic >=0.1 && <0.2 if flag(checked) build-depends: primitive-checked >= 0.7 && <0.8 else @@ -59,6 +62,7 @@ test-suite test type: exitcode-stdio-1.0 hs-source-dirs: test, common main-is: Main.hs + ghc-options: -O2 -Wall other-modules: HexWord64 build-depends: @@ -70,6 +74,7 @@ test-suite test , tasty-quickcheck >=0.10.1 && <0.11 , tasty-hunit >=0.10.0.2 && <0.11 , tasty >=1.2.3 && <1.3 + , text >=1.2 && <1.3 , primitive , vector diff --git a/src/Data/ByteArray/Builder/Small.hs b/src/Data/ByteArray/Builder.hs similarity index 93% rename from src/Data/ByteArray/Builder/Small.hs rename to src/Data/ByteArray/Builder.hs index a9cfabe..ce49b26 100644 --- a/src/Data/ByteArray/Builder/Small.hs +++ b/src/Data/ByteArray/Builder.hs @@ -6,7 +6,7 @@ {-# language ScopedTypeVariables #-} {-# language UnboxedTuples #-} -module Data.ByteArray.Builder.Small +module Data.ByteArray.Builder ( -- * Bounded Primitives Builder(..) , construct @@ -24,6 +24,8 @@ module Data.ByteArray.Builder.Small , bytearray , shortTextUtf8 , shortTextJsonString + , cstring + , stringUtf8 -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -60,32 +62,15 @@ import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.Text.Short (ShortText) import Data.Char (ord) +import Data.ByteArray.Builder.Unsafe (Builder(Builder)) +import Data.ByteArray.Builder.Unsafe (stringUtf8,cstring) import qualified GHC.Exts as Exts import qualified Data.Text.Short as TS import qualified Data.Primitive as PM import qualified Data.Vector as V -import qualified Data.ByteArray.Builder.Small.Bounded as Bounded - --- | An unmaterialized sequence of bytes that may be pasted --- into a mutable byte array. -newtype Builder = Builder - -- This functions takes an offset and a number of remaining bytes - -- and returns the new offset. - (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) - -instance Semigroup Builder where - {-# inline (<>) #-} - Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of - (# s1, r #) -> case r /=# (-1#) of - 1# -> g arr r (len0 +# (off0 -# r)) s1 - _ -> (# s1, (-1#) #) - -instance Monoid Builder where - mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) - -instance IsString Builder where - fromString = shortTextUtf8 . TS.fromString +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded -- | Run a builder. An accurate size hint is important for good performance. -- The size hint should be slightly larger than the actual size. @@ -101,7 +86,7 @@ run hint b = runByteArrayST $ do Just len -> do shrinkMutableByteArray arr len unsafeFreezeByteArray arr - go hint + go (max hint 1) -- | Variant of 'pasteArrayST' that runs in 'IO'. pasteArrayIO :: @@ -186,9 +171,10 @@ construct f = Builder Nothing -> (# s1, (-1#) #) Just (I# n) -> (# s1, n #) +-- | Convert a bounded builder to an unbounded one. fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder {-# inline fromBounded #-} -fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 -> +fromBounded (UnsafeBounded.Builder f) = Builder $ \arr off len s0 -> case fromIntegral (natVal' (proxy# :: Proxy# n)) of I# req -> case len >=# req of 1# -> f arr off s0 @@ -226,7 +212,7 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0 then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1) else do write2 dst doff '\\' 'u' - doff' <- Bounded.pasteST + doff' <- UnsafeBounded.pasteST (Bounded.word16PaddedUpperHex (fromIntegral (c2w c))) dst (doff + 2) go (soff + 1) (slen - 1) doff' diff --git a/src/Data/ByteArray/Builder/Small/Bounded.hs b/src/Data/ByteArray/Builder/Bounded.hs similarity index 82% rename from src/Data/ByteArray/Builder/Small/Bounded.hs rename to src/Data/ByteArray/Builder/Bounded.hs index 892b36d..5db8a12 100644 --- a/src/Data/ByteArray/Builder/Small/Bounded.hs +++ b/src/Data/ByteArray/Builder/Bounded.hs @@ -1,4 +1,3 @@ -{-# language GADTs #-} {-# language KindSignatures #-} {-# language ScopedTypeVariables #-} {-# language BangPatterns #-} @@ -12,23 +11,18 @@ {-# language TypeApplications #-} -- | The functions in this module are explict in the amount of bytes they require. -module Data.ByteArray.Builder.Small.Bounded +module Data.ByteArray.Builder.Bounded ( -- * Builder - Builder(..) - , construct + Builder -- * Execute , run - , pasteST , pasteGrowST - , pasteIO -- * Combine , empty , append -- * Bounds Manipulation - , (<=) - , lessThanEqual - , isLessThanEqual - , raise + , weaken + , substitute -- * Encode Integral Types -- ** Human-Readable , word64Dec @@ -56,46 +50,33 @@ import Data.Bits import Data.Char (ord) import Data.Primitive import GHC.Exts -import GHC.ST -import GHC.Word -import GHC.Int -import Data.Kind -import GHC.TypeLits (KnownNat,Nat,type (+),natVal') -import qualified GHC.TypeLits as GHC +import GHC.ST (ST(ST)) +import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) +import GHC.Int (Int64(I64#)) +import GHC.TypeLits (KnownNat,type (+),natVal') import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..)) -import qualified Control.Category as Cat +import Arithmetic.Types (type (<=), type (:=:)) +import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..)) +import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe import qualified Data.Primitive as PM --- | A builder parameterized by the maximum number of bytes it uses --- when executed. -newtype Builder :: Nat -> Type where - Builder :: - (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) - -> Builder n - +-- Used internally. knownNat :: KnownNat n => Proxy# n -> Int +{-# inline knownNat #-} knownNat p = fromIntegral (natVal' p) --- | Execute the builder. This function is safe. +-- | Execute the bounded builder. run :: forall n. KnownNat n => Builder n -- ^ Builder -> ByteArray {-# inline run #-} run b = runST $ do arr <- newByteArray (knownNat (proxy# :: Proxy# n)) - len <- pasteST b arr 0 + len <- Unsafe.pasteST b arr 0 shrinkMutableByteArray arr len unsafeFreezeByteArray arr --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int -{-# inline pasteST #-} -pasteST (Builder f) (MutableByteArray arr) (I# off) = - ST $ \s0 -> case f arr off s0 of - (# s1, r #) -> (# s1, (I# r) #) - -- | Paste the builder into the byte array starting at offset zero. -- This reallocates the byte array if it cannot accomodate the builder, -- growing it by the minimum amount necessary. @@ -112,67 +93,40 @@ pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do let sz1 = off0 + req if sz1 <= sz0 then do - off1 <- pasteST b arr0 off0 + off1 <- Unsafe.pasteST b arr0 off0 pure (MutableByteArrayOffset arr0 off1) else do arr1 <- PM.resizeMutableByteArray arr0 sz1 - off1 <- pasteST b arr1 off0 + off1 <- Unsafe.pasteST b arr1 off0 pure (MutableByteArrayOffset arr1 off1) --- | This function does not enforce the known upper bound on the --- size. It is up to the user to do this. -pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int -{-# inline pasteIO #-} -pasteIO b m off = stToIO (pasteST b m off) - --- | Constructor for 'Builder' that works on a function with lifted --- arguments instead of unlifted ones. This is just as unsafe as the --- actual constructor. -construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n -{-# inline construct #-} -construct f = Builder - $ \arr off s0 -> - case unST (f (MutableByteArray arr) (I# off)) s0 of - (# s1, (I# n) #) -> (# s1, n #) - -infixr 9 `append` - -- | The monoidal unit of `append` empty :: Builder 0 empty = Builder $ \_ off0 s0 -> (# s0, off0 #) +infixr 9 `append` + -- | Concatenate two builders. -append :: Builder n -> Builder m -> Builder (n + m) +append :: Builder m -> Builder n -> Builder (m + n) append (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 --- | A proof that n is less than or equal to m -newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n +-- | Weaken the bound on the maximum number of bytes required. For example, +-- to use two builders with unequal bounds in a disjunctive setting: +-- +-- > import qualified Arithmetic.Lte as Lte +-- > +-- > buildNumber :: Either Double Word64 -> Builder 32 +-- > buildNumber = \case +-- > Left d -> doubleDec d +-- > Right w -> weaken (Lte.constant @19 @32) (word64Dec w) +weaken :: forall m n. (m <= n) -> Builder m -> Builder n +weaken !_ (Builder f) = Builder f -instance Cat.Category (<=) where - id = LessThanEqual 0 - -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a) - LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba) - --- | Dynamically check than 'n' is less than or equal to 'm' -isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m) -isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing - where diff = knownNat m - knownNat n - --- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time. -lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m -lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n)) - --- | Weaken the bound on the maximum number of bytes required. --- >>> :{ --- buildNumber :: Either Double Word64 -> Builder 32 --- buildNumber = \case --- Left d -> doubleDec d --- Right w -> raise lessThanEqual (word64Dec w) --- :} -raise :: forall m n. n <= m -> Builder n -> Builder m -raise !_ (Builder f) = Builder f +-- | Replace the upper bound on size with an equal number. +substitute :: forall m n. (m :=: n) -> Builder m -> Builder n +substitute !_ (Builder f) = Builder f -- | Encode a double-floating-point number, using decimal notation or -- scientific notation depending on the magnitude. This has undefined @@ -207,7 +161,7 @@ int64Dec (I64# w) = int64Dec# w -- the word. This is only used internally. wordCommonDec# :: Word# -> Builder n {-# noinline wordCommonDec# #-} -wordCommonDec# w# = construct $ \arr off0 -> if w /= 0 +wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0 then internalWordLoop arr off0 (W# w#) else do writeByteArray arr off0 (c2w '0') @@ -230,7 +184,7 @@ internalWordLoop arr off0 x0 = go off0 x0 where -- | Requires up to 19 bytes. int64Dec# :: Int# -> Builder 20 {-# noinline int64Dec# #-} -int64Dec# w# = construct $ \arr off0 -> case compare w 0 of +int64Dec# w# = Unsafe.construct $ \arr off0 -> case compare w 0 of GT -> internalWordLoop arr off0 (fromIntegral w) EQ -> do writeByteArray arr off0 (c2w '0') @@ -286,7 +240,7 @@ word8PaddedUpperHex (W8# w) = word8PaddedUpperHex# w -- might not be. Benchmark this. word64PaddedUpperHex# :: Word# -> Builder 16 {-# noinline word64PaddedUpperHex# #-} -word64PaddedUpperHex# w# = construct $ \arr off -> do +word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 60)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52)) @@ -309,7 +263,7 @@ word64PaddedUpperHex# w# = construct $ \arr off -> do word32PaddedUpperHex# :: Word# -> Builder 8 {-# noinline word32PaddedUpperHex# #-} -word32PaddedUpperHex# w# = construct $ \arr off -> do +word32PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 28)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 24)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 20)) @@ -326,7 +280,7 @@ word32PaddedUpperHex# w# = construct $ \arr off -> do -- GHC make the decision. Open an issue on github if this is -- a problem. word16PaddedUpperHex# :: Word# -> Builder 4 -word16PaddedUpperHex# w# = construct $ \arr off -> do +word16PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 12)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 8)) writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 4)) @@ -338,7 +292,7 @@ word16PaddedUpperHex# w# = construct $ \arr off -> do -- Definitely want this to inline. It's maybe a dozen instructions total. word8PaddedUpperHex# :: Word# -> Builder 2 {-# inline word8PaddedUpperHex #-} -word8PaddedUpperHex# w# = construct $ \arr off -> do +word8PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do writeByteArray arr off (toHexUpper (unsafeShiftR w 4)) writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 0)) pure (off + 2) @@ -350,28 +304,28 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do ascii :: Char -> Builder 1 ascii c = word8 (fromIntegral @Int @Word8 (ord c)) --- | Encode an UTF8 char. This only uses as much space as is required. +-- | Encode a character as UTF-8. This only uses as much space as is required. char :: Char -> Builder 4 char c - | codepoint < 0x80 = construct $ \arr off -> do + | codepoint < 0x80 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 codepoint) pure (off + 1) - | codepoint < 0x800 = construct $ \arr off -> do + | codepoint < 0x800 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint)) return (off + 2) - | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do + | codepoint >= 0xD800 && codepoint < 0xE000 = Unsafe.construct $ \arr off -> do -- Codepoint U+FFFD writeByteArray arr off (0xEF :: Word8) writeByteArray arr (off + 1) (0xBF :: Word8) writeByteArray arr (off + 2) (0xBD :: Word8) return (off + 3) - | codepoint < 0x10000 = construct $ \arr off -> do + | codepoint < 0x10000 = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint)) writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint)) return (off + 3) - | otherwise = construct $ \arr off -> do + | otherwise = Unsafe.construct $ \arr off -> do writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint)) writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint)) writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint)) @@ -418,7 +372,7 @@ char c -- | Requires exactly 8 bytes. Dump the octets of a 64-bit -- word in a big-endian fashion. word64BE :: Word64 -> Builder 8 -word64BE w = construct $ \arr off -> do +word64BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56)) writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48)) writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40)) @@ -432,7 +386,7 @@ word64BE w = construct $ \arr off -> do -- | Requires exactly 4 bytes. Dump the octets of a 32-bit -- word in a big-endian fashion. word32BE :: Word32 -> Builder 4 -word32BE w = construct $ \arr off -> do +word32BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24)) writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16)) writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8)) @@ -442,13 +396,13 @@ word32BE w = construct $ \arr off -> do -- | Requires exactly 2 bytes. Dump the octets of a 16-bit -- word in a big-endian fashion. word16BE :: Word16 -> Builder 2 -word16BE w = construct $ \arr off -> do +word16BE w = Unsafe.construct $ \arr off -> do writeByteArray arr (off ) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8)) writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 w) pure (off + 2) word8 :: Word8 -> Builder 1 -word8 w = construct $ \arr off -> do +word8 w = Unsafe.construct $ \arr off -> do writeByteArray arr off w pure (off + 1) @@ -469,9 +423,6 @@ reverseBytes arr begin end = go begin end where c2w :: Char -> Word8 c2w = fromIntegral . ord -unST :: ST s a -> State# s -> (# State# s, a #) -unST (ST f) = f - shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s () shrinkMutableByteArray (MutableByteArray arr) (I# sz) = primitive_ (shrinkMutableByteArray# arr sz) diff --git a/src/Data/ByteArray/Builder/Bounded/Unsafe.hs b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs new file mode 100644 index 0000000..9bad04f --- /dev/null +++ b/src/Data/ByteArray/Builder/Bounded/Unsafe.hs @@ -0,0 +1,59 @@ +{-# language DataKinds #-} +{-# language GADTSyntax #-} +{-# language KindSignatures #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Bounded.Unsafe + ( -- * Types + Builder(..) + -- * Construct + , construct + -- * Run + , pasteST + , pasteIO + ) where + +import GHC.TypeLits (Nat) +import Data.Kind (Type) +import GHC.IO (stToIO) +import GHC.ST (ST(ST)) +import GHC.Exts (Int(I#),RealWorld,Int#,State#,MutableByteArray#) +import Data.Primitive (MutableByteArray(..)) + +-- | A builder parameterized by the maximum number of bytes it uses +-- when executed. +newtype Builder :: Nat -> Type where + Builder :: + (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)) + -> Builder n + +-- | Constructor for 'Builder' that works on a function with lifted +-- arguments instead of unlifted ones. This is just as unsafe as the +-- actual constructor. +construct :: (forall s. MutableByteArray s -> Int -> ST s Int) -> Builder n +{-# inline construct #-} +construct f = Builder + $ \arr off s0 -> + case unST (f (MutableByteArray arr) (I# off)) s0 of + (# s1, (I# n) #) -> (# s1, n #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteST :: Builder n -> MutableByteArray s -> Int -> ST s Int +{-# inline pasteST #-} +pasteST (Builder f) (MutableByteArray arr) (I# off) = + ST $ \s0 -> case f arr off s0 of + (# s1, r #) -> (# s1, (I# r) #) + +-- | This function does not enforce the known upper bound on the +-- size. It is up to the user to do this. +pasteIO :: Builder n -> MutableByteArray RealWorld -> Int -> IO Int +{-# inline pasteIO #-} +pasteIO b m off = stToIO (pasteST b m off) + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/src/Data/ByteArray/Builder/Unsafe.hs b/src/Data/ByteArray/Builder/Unsafe.hs new file mode 100644 index 0000000..18c2eba --- /dev/null +++ b/src/Data/ByteArray/Builder/Unsafe.hs @@ -0,0 +1,94 @@ +{-# language BangPatterns #-} +{-# language DuplicateRecordFields #-} +{-# language LambdaCase #-} +{-# language MagicHash #-} +{-# language RankNTypes #-} +{-# language ScopedTypeVariables #-} +{-# language UnboxedTuples #-} + +module Data.ByteArray.Builder.Unsafe + ( -- * Types + Builder(..) + -- * Safe Functions + -- | These functions are actually completely safe, but they are defined + -- here because they are used by typeclass instances. Import them from + -- @Data.ByteArray.Builder@ instead. + , stringUtf8 + , cstring + ) where + +import Data.Primitive (MutableByteArray(MutableByteArray)) +import Foreign.C.String (CString) +import GHC.Exts ((-#),(+#),(/=#),(>#)) +import GHC.Exts (Addr#,Int(I#),Ptr(Ptr)) +import GHC.Exts (IsString,Int#,State#,MutableByteArray#) +import GHC.ST (ST(ST)) +import GHC.Base (unpackCString#,unpackCStringUtf8#) + +import qualified GHC.Exts as Exts +import qualified Data.ByteArray.Builder.Bounded as Bounded +import qualified Data.ByteArray.Builder.Bounded.Unsafe as UnsafeBounded + +-- | An unmaterialized sequence of bytes that may be pasted +-- into a mutable byte array. +newtype Builder = Builder + -- This functions takes an offset and a number of remaining bytes + -- and returns the new offset. + (forall s. MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)) + +instance IsString Builder where + {-# inline fromString #-} + fromString = stringUtf8 + +instance Semigroup Builder where + {-# inline (<>) #-} + Builder f <> Builder g = Builder $ \arr off0 len0 s0 -> case f arr off0 len0 s0 of + (# s1, r #) -> case r /=# (-1#) of + 1# -> g arr r (len0 +# (off0 -# r)) s1 + _ -> (# s1, (-1#) #) + +instance Monoid Builder where + {-# inline mempty #-} + mempty = Builder $ \_ off0 _ s0 -> (# s0, off0 #) + +-- | Create a builder from a cons-list of 'Char'. These +-- are be UTF-8 encoded. +stringUtf8 :: String -> Builder +{-# inline stringUtf8 #-} +stringUtf8 cs = Builder (\arr off0 len0 s0 -> goString cs arr off0 len0 s0) + +-- | Create a builder from a @NUL@-terminated 'CString'. This ignores any +-- textual encoding, copying bytes until @NUL@ is reached. +cstring :: CString -> Builder +{-# inline cstring #-} +cstring (Ptr cs) = Builder (\arr off0 len0 s0 -> goCString cs arr off0 len0 s0) + +goString :: String -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +{-# noinline goString #-} +goString [] _ off0 _ s0 = (# s0, off0 #) +goString (c : cs) buf off0 len0 s0 = case len0 ># 3# of + 1# -> case unST (UnsafeBounded.pasteST (Bounded.char c) (MutableByteArray buf) (I# off0)) s0 of + (# s1, I# off1 #) -> goString cs buf off1 (len0 -# (off1 -# off0)) s1 + _ -> (# s0, (-1#) #) + +-- We have to have a rule for both unpackCString# and unpackCStringUtf8# +-- since GHC uses a different function based on whether or not non-ASCII +-- codepoints are used in the string. +{-# RULES +"Builder stringUtf8/cstring" forall s a b c d. + goString (unpackCString# s) a b c d = goCString s a b c d +"Builder stringUtf8/cstring-utf8" forall s a b c d. + goString (unpackCStringUtf8# s) a b c d = goCString s a b c d +#-} + +goCString :: Addr# -> MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) +goCString addr buf off0 len0 s0 = case Exts.indexWord8OffAddr# addr 0# of + 0## -> (# s0, off0 #) + w -> case len0 of + 0# -> (# s0, (-1#) #) + _ -> case Exts.writeWord8Array# buf off0 w s0 of + s1 -> goCString (Exts.plusAddr# addr 1# ) buf (off0 +# 1# ) (len0 -# 1# ) s1 + +unST :: ST s a -> State# s -> (# State# s, a #) +unST (ST f) = f + diff --git a/test/Main.hs b/test/Main.hs index 649711e..37b2cb0 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -5,7 +5,7 @@ import Control.Monad.ST (runST) import Data.Bytes.Types (MutableBytes(..)) -import Data.ByteArray.Builder.Small +import Data.ByteArray.Builder import Data.Word import Data.Char (ord) import Data.Primitive (ByteArray) @@ -14,15 +14,19 @@ import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===)) import Text.Printf (printf) import Test.Tasty.HUnit ((@=?)) + +import qualified Data.ByteString as ByteString import qualified Data.ByteString.Builder as BB -import qualified Data.Primitive as PM -import qualified Data.List as L -import qualified Data.Vector as V -import qualified Test.Tasty.QuickCheck as TQC -import qualified Test.QuickCheck as QC -import qualified GHC.Exts as Exts import qualified Data.ByteString.Lazy.Char8 as LB +import qualified Data.List as L +import qualified Data.Primitive as PM +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Vector as V +import qualified GHC.Exts as Exts +import qualified Test.QuickCheck as QC import qualified Test.Tasty.HUnit as THU +import qualified Test.Tasty.QuickCheck as TQC import qualified HexWord64 @@ -54,6 +58,9 @@ tests = testGroup "Tests" (runArray word64Dec (V.fromList xs)) === pack (foldMap show xs) + , THU.testCase "stringUtf8" $ + packUtf8 "¿Cómo estás? I am doing well." @=? + run 1 (stringUtf8 "¿Cómo estás? I am doing well.") , THU.testCase "doubleDec-A" $ pack (show (2 :: Int)) @=? run 1 (doubleDec 2.0) , THU.testCase "doubleDec-B" $ @@ -88,8 +95,8 @@ tests = testGroup "Tests" , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y -> run 1 - ( fromUnsafe (HexWord64.word64PaddedUpperHex x) - <> fromUnsafe (HexWord64.word64PaddedUpperHex y) + ( fromBounded (HexWord64.word64PaddedUpperHex x) + <> fromBounded (HexWord64.word64PaddedUpperHex y) ) === pack (showWord64PaddedUpperHex x <> showWord64PaddedUpperHex y) @@ -99,6 +106,9 @@ tests = testGroup "Tests" pack :: String -> ByteArray pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) +packUtf8 :: String -> ByteArray +packUtf8 = Exts.fromList . ByteString.unpack . TE.encodeUtf8 . T.pack + -- This is used to test pasteArrayST runArray :: (a -> Builder) -- ^ Builder