switch unsafe to bounded + char encoding

2019-09-01 22:43:54 -07:00 · 2019-09-01 22:43:54 -07:00 · 9a14600a38
commit 9a14600a38
parent 22dfde2936
3 changed files with 152 additions and 29 deletions
--- a/small-bytearray-builder.cabal
+++ b/small-bytearray-builder.cabal
@ -37,7 +37,7 @@ flag checked
 library
  exposed-modules:
    Data.ByteArray.Builder.Small
-    Data.ByteArray.Builder.Small.Unsafe
+    Data.ByteArray.Builder.Small.Bounded
  build-depends:
    , base >=4.12.0.0 && <5
    , byteslice >=0.1 && <0.2
--- a/src/Data/ByteArray/Builder/Small.hs
+++ b/src/Data/ByteArray/Builder/Small.hs
@ -7,10 +7,10 @@
 {-# language UnboxedTuples #-}
 module Data.ByteArray.Builder.Small
-  ( -- * Unsafe Primitives
+  ( -- * Bounded Primitives
    Builder(..)
  , construct
-  , fromUnsafe
+  , fromBounded
    -- * Evaluation
  , run
  , pasteST
@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small
  , word32PaddedUpperHex
  , word16PaddedUpperHex
  , word8PaddedUpperHex
  , ascii
  , char
    -- ** Machine-Readable
  , word64BE
  , word32BE
  , word16BE
  , word8
    -- * Encode Floating-Point Types
    -- ** Human-Readable
  , doubleDec
@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts
 import qualified Data.Text.Short as TS
 import qualified Data.Primitive as PM
 import qualified Data.Vector as V
-import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
+import qualified Data.ByteArray.Builder.Small.Bounded as Bounded
 -- | An unmaterialized sequence of bytes that may be pasted
 -- into a mutable byte array.
@ -183,9 +186,9 @@ construct f = Builder
        Nothing -> (# s1, (-1#) #)
        Just (I# n) -> (# s1, n #)
-fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder
+fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder
-{-# inline fromUnsafe #-}
+{-# inline fromBounded #-}
-fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 ->
+fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 ->
  case fromIntegral (natVal' (proxy# :: Proxy# n)) of
    I# req -> case len >=# req of
      1# -> f arr off s0
@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
                    then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
                    else do
                      write2 dst doff '\\' 'u'
-                      doff' <- Unsafe.pasteST
+                      doff' <- Bounded.pasteST
-                        (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
+                        (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
                        dst (doff + 2)
                      go (soff + 1) (slen - 1) doff'
                else pure doff
@ -262,61 +265,70 @@ shortTextJsonString a =
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word64Dec :: Word64 -> Builder
-word64Dec w = fromUnsafe (Unsafe.word64Dec w)
+word64Dec w = fromBounded (Bounded.word64Dec w)
 -- | Encodes an unsigned 16-bit integer as decimal.
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word32Dec :: Word32 -> Builder
-word32Dec w = fromUnsafe (Unsafe.word32Dec w)
+word32Dec w = fromBounded (Bounded.word32Dec w)
 -- | Encodes an unsigned 16-bit integer as decimal.
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word16Dec :: Word16 -> Builder
-word16Dec w = fromUnsafe (Unsafe.word16Dec w)
+word16Dec w = fromBounded (Bounded.word16Dec w)
 -- | Encode a double-floating-point number, using decimal notation or
 -- scientific notation depending on the magnitude. This has undefined
 -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
 -- crash, but the generated numbers will be nonsense.
 doubleDec :: Double -> Builder
-doubleDec w = fromUnsafe (Unsafe.doubleDec w)
+doubleDec w = fromBounded (Bounded.doubleDec w)
 -- | Encodes a signed 64-bit integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
 -- Negative numbers are preceded by a minus sign. Positive numbers
 -- are not preceded by anything.
 int64Dec :: Int64 -> Builder
-int64Dec w = fromUnsafe (Unsafe.int64Dec w)
+int64Dec w = fromBounded (Bounded.int64Dec w)
 -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 16 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @00000000000003FE@.
 word64PaddedUpperHex :: Word64 -> Builder
 word64PaddedUpperHex w =
-  fromUnsafe (Unsafe.word64PaddedUpperHex w)
+  fromBounded (Bounded.word64PaddedUpperHex w)
 -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 8 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @000003FE@.
 word32PaddedUpperHex :: Word32 -> Builder
 word32PaddedUpperHex w =
-  fromUnsafe (Unsafe.word32PaddedUpperHex w)
+  fromBounded (Bounded.word32PaddedUpperHex w)
 -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 4 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @03FE@.
 word16PaddedUpperHex :: Word16 -> Builder
 word16PaddedUpperHex w =
-  fromUnsafe (Unsafe.word16PaddedUpperHex w)
+  fromBounded (Bounded.word16PaddedUpperHex w)
 -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 2 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 11 as @0B@.
 word8PaddedUpperHex :: Word8 -> Builder
 word8PaddedUpperHex w =
-  fromUnsafe (Unsafe.word8PaddedUpperHex w)
+  fromBounded (Bounded.word8PaddedUpperHex w)
 -- | Encode an ASCII char.
 -- Precondition: Input must be an ASCII character. This is not checked.
 ascii :: Char -> Builder
 ascii c = fromBounded (Bounded.char c)
 -- | Encode an UTF8 char. This only uses as much space as is required.
 char :: Char -> Builder
 char c = fromBounded (Bounded.char c)
 unST :: ST s a -> State# s -> (# State# s, a #)
 unST (ST f) = f
@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
 -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
 -- word in a big-endian fashion.
 word64BE :: Word64 -> Builder
-word64BE w = fromUnsafe (Unsafe.word64BE w)
+word64BE w = fromBounded (Bounded.word64BE w)
 -- | Requires exactly 4 bytes. Dump the octets of a 32-bit
 -- word in a big-endian fashion.
 word32BE :: Word32 -> Builder
-word32BE w = fromUnsafe (Unsafe.word32BE w)
+word32BE w = fromBounded (Bounded.word32BE w)
 -- | Requires exactly 2 bytes. Dump the octets of a 16-bit
 -- word in a big-endian fashion.
 word16BE :: Word16 -> Builder
-word16BE w = fromUnsafe (Unsafe.word16BE w)
+word16BE w = fromBounded (Bounded.word16BE w)
 word8 :: Word8 -> Builder
 word8 w = fromBounded (Bounded.word8 w)
 -- ShortText is already UTF-8 encoded. This is a no-op.
 shortTextToByteArray :: ShortText -> ByteArray
--- a/src/Data/ByteArray/Builder/Small/Bounded.hs
+++ b/src/Data/ByteArray/Builder/Small/Bounded.hs
@ -1,8 +1,9 @@
-{-# language GADTSyntax #-}
+{-# language GADTs #-}
 {-# language KindSignatures #-}
 {-# language ScopedTypeVariables #-}
 {-# language BangPatterns #-}
 {-# language MagicHash #-}
 {-# language BinaryLiterals #-}
 {-# language UnboxedTuples #-}
 {-# language RankNTypes #-}
 {-# language LambdaCase #-}
@ -10,9 +11,8 @@
 {-# language DataKinds #-}
 {-# language TypeApplications #-}
-- | The functions in this module do not check to
+-- | The functions in this module are explict in the amount of bytes they require.
-- see if there is enough space in the buffer.
+module Data.ByteArray.Builder.Small.Bounded
 module Data.ByteArray.Builder.Small.Unsafe
  ( -- * Builder
    Builder(..)
  , construct
@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe
  , pasteIO
    -- * Combine
  , append
    -- * Bounds Manipulation
  , (<=)
  , lessThanEqual
  , isLessThanEqual
  , raise
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe
  , word32PaddedUpperHex
  , word16PaddedUpperHex
  , word8PaddedUpperHex
  , ascii
  , char
    -- ** Machine-Readable
  , word64BE
  , word32BE
@ -53,8 +60,9 @@ import GHC.Word
 import GHC.Int
 import Data.Kind
 import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
 import qualified GHC.TypeLits as GHC
 import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
-import Control.Monad (when)
+import qualified Control.Category as Cat
 import qualified Data.Primitive as PM
@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where
        (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
     -> Builder n
 knownNat :: KnownNat n => Proxy# n -> Int
 knownNat p = fromIntegral (natVal' p)
 -- | Execute the builder. This function is safe.
 run :: forall n. KnownNat n
  => Builder n -- ^ Builder
  -> ByteArray
 {-# inline run #-}
 run b = runST $ do
-  arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n)))
+  arr <- newByteArray (knownNat (proxy# :: Proxy# n))
  len <- pasteST b arr 0
  shrinkMutableByteArray arr len
  unsafeFreezeByteArray arr
@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n
 {-# inline pasteGrowST #-}
 pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
  sz0 <- PM.getSizeofMutableByteArray arr0
-  let req = fromIntegral (natVal' (proxy# :: Proxy# n))
+  let req = knownNat (proxy# :: Proxy# n)
  let sz1 = off0 + req
  if sz1 <= sz0
    then do
@ -131,6 +142,33 @@ append (Builder f) (Builder g) =
  Builder $ \arr off0 s0 -> case f arr off0 s0 of
    (# s1, r #) -> g arr r s1
 -- | A proof that n is less than or equal to m
 newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n
 instance Cat.Category (<=) where
  id = LessThanEqual 0
  -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a)
  LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba)
 -- | Dynamically check than 'n' is less than or equal to 'm'
 isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m)
 isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing
  where diff = knownNat m - knownNat n
 -- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time.
 lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m
 lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n))
 -- | Weaken the bound on the maximum number of bytes required.
 -- >>> :{
 -- buildNumber :: Either Double Word64 -> Builder 32
 -- buildNumber = \case
 --   Left d  -> doubleDec d
 --   Right w -> raise lessThanEqual (word64Dec w)
 -- :}
 raise :: forall m n. n <= m -> Builder n -> Builder m
 raise !_ (Builder f) = Builder f
 -- | Encode a double-floating-point number, using decimal notation or
 -- scientific notation depending on the magnitude. This has undefined
 -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do
  where
  w = W# w#
 -- | Encode an ASCII char.
 -- Precondition: Input must be an ASCII character. This is not checked.
 ascii :: Char -> Builder 1
 ascii c = word8 (fromIntegral @Int @Word8 (ord c))
 -- | Encode an UTF8 char. This only uses as much space as is required.
 char :: Char -> Builder 4
 char c
  | codepoint < 0x80 = construct $ \arr off -> do
      writeByteArray arr off (unsafeWordToWord8 codepoint)
      pure (off + 1)
  | codepoint < 0x800 = construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteTwoOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
      return (off + 2)
  | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do
      -- Codepoint U+FFFD
      writeByteArray arr off       (0xEF :: Word8)
      writeByteArray arr (off + 1) (0xBF :: Word8)
      writeByteArray arr (off + 2) (0xBD :: Word8)
      return (off + 3)
  | codepoint < 0x10000 = construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteThreeOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
      return (off + 3)
  | otherwise = construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteFourOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
      writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
      return (off + 4)
  where
    codepoint :: Word
    codepoint = fromIntegral (ord c)
    unsafeWordToWord8 :: Word -> Word8
    unsafeWordToWord8 (W# w) = W8# w
    -- precondition: codepoint is less than 0x800
    byteTwoOne :: Word -> Word
    byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
    byteTwoTwo :: Word -> Word
    byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
    -- precondition: codepoint is less than 0x1000
    byteThreeOne :: Word -> Word
    byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
    byteThreeTwo :: Word -> Word
    byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
    byteThreeThree :: Word -> Word
    byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
    -- precondition: codepoint is less than 0x110000
    byteFourOne :: Word -> Word
    byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
    byteFourTwo :: Word -> Word
    byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
    byteFourThree :: Word -> Word
    byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
    byteFourFour :: Word -> Word
    byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
 -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
 -- word in a big-endian fashion.
 word64BE :: Word64 -> Builder 8