switch unsafe to bounded + char encoding

2019-09-01 22:43:54 -07:00 · 2019-09-01 22:43:54 -07:00 · 9a14600a38
commit 9a14600a38
parent 22dfde2936
3 changed files with 152 additions and 29 deletions
--- a/small-bytearray-builder.cabal
+++ b/small-bytearray-builder.cabal
@ -37,7 +37,7 @@ flag checked
 library
  exposed-modules:
    Data.ByteArray.Builder.Small
-    Data.ByteArray.Builder.Small.Unsafe
+    Data.ByteArray.Builder.Small.Bounded
  build-depends:
    , base >=4.12.0.0 && <5
    , byteslice >=0.1 && <0.2
--- a/src/Data/ByteArray/Builder/Small.hs
+++ b/src/Data/ByteArray/Builder/Small.hs
@ -7,10 +7,10 @@
 {-# language UnboxedTuples #-}

 module Data.ByteArray.Builder.Small
-  ( -- * Unsafe Primitives
+  ( -- * Bounded Primitives
    Builder(..)
  , construct
-  , fromUnsafe
+  , fromBounded
    -- * Evaluation
  , run
  , pasteST
@ -34,10 +34,13 @@ module Data.ByteArray.Builder.Small
  , word32PaddedUpperHex
  , word16PaddedUpperHex
  , word8PaddedUpperHex
+  , ascii
+  , char
    -- ** Machine-Readable
  , word64BE
  , word32BE
  , word16BE
+  , word8
    -- * Encode Floating-Point Types
    -- ** Human-Readable
  , doubleDec
@ -62,7 +65,7 @@ import qualified GHC.Exts as Exts
 import qualified Data.Text.Short as TS
 import qualified Data.Primitive as PM
 import qualified Data.Vector as V
-import qualified Data.ByteArray.Builder.Small.Unsafe as Unsafe
+import qualified Data.ByteArray.Builder.Small.Bounded as Bounded

 -- | An unmaterialized sequence of bytes that may be pasted
 -- into a mutable byte array.
@ -183,9 +186,9 @@ construct f = Builder
        Nothing -> (# s1, (-1#) #)
        Just (I# n) -> (# s1, n #)

-fromUnsafe :: forall n. KnownNat n => Unsafe.Builder n -> Builder
-{-# inline fromUnsafe #-}
-fromUnsafe (Unsafe.Builder f) = Builder $ \arr off len s0 ->
+fromBounded :: forall n. KnownNat n => Bounded.Builder n -> Builder
+{-# inline fromBounded #-}
+fromBounded (Bounded.Builder f) = Builder $ \arr off len s0 ->
  case fromIntegral (natVal' (proxy# :: Proxy# n)) of
    I# req -> case len >=# req of
      1# -> f arr off s0
@ -223,8 +226,8 @@ slicedUtf8TextJson !src# !soff0# !slen0# = construct $ \(MutableBytes dst doff0
                    then PM.writeByteArray dst doff (c2w c) *> go (soff + 1) (slen - 1) (doff + 1)
                    else do
                      write2 dst doff '\\' 'u'
-                      doff' <- Unsafe.pasteST
-                        (Unsafe.word16PaddedUpperHex (fromIntegral (c2w c)))
+                      doff' <- Bounded.pasteST
+                        (Bounded.word16PaddedUpperHex (fromIntegral (c2w c)))
                        dst (doff + 2)
                      go (soff + 1) (slen - 1) doff'
                else pure doff
@ -262,61 +265,70 @@ shortTextJsonString a =
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word64Dec :: Word64 -> Builder
-word64Dec w = fromUnsafe (Unsafe.word64Dec w)
+word64Dec w = fromBounded (Bounded.word64Dec w)

 -- | Encodes an unsigned 16-bit integer as decimal.
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word32Dec :: Word32 -> Builder
-word32Dec w = fromUnsafe (Unsafe.word32Dec w)
+word32Dec w = fromBounded (Bounded.word32Dec w)

 -- | Encodes an unsigned 16-bit integer as decimal.
 -- This encoding never starts with a zero unless the
 -- argument was zero.
 word16Dec :: Word16 -> Builder
-word16Dec w = fromUnsafe (Unsafe.word16Dec w)
+word16Dec w = fromBounded (Bounded.word16Dec w)

 -- | Encode a double-floating-point number, using decimal notation or
 -- scientific notation depending on the magnitude. This has undefined
 -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
 -- crash, but the generated numbers will be nonsense.
 doubleDec :: Double -> Builder
-doubleDec w = fromUnsafe (Unsafe.doubleDec w)
+doubleDec w = fromBounded (Bounded.doubleDec w)

 -- | Encodes a signed 64-bit integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
 -- Negative numbers are preceded by a minus sign. Positive numbers
 -- are not preceded by anything.
 int64Dec :: Int64 -> Builder
-int64Dec w = fromUnsafe (Unsafe.int64Dec w)
+int64Dec w = fromBounded (Bounded.int64Dec w)

 -- | Encode a 64-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 16 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @00000000000003FE@.
 word64PaddedUpperHex :: Word64 -> Builder
 word64PaddedUpperHex w =
-  fromUnsafe (Unsafe.word64PaddedUpperHex w)
+  fromBounded (Bounded.word64PaddedUpperHex w)

 -- | Encode a 32-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 8 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @000003FE@.
 word32PaddedUpperHex :: Word32 -> Builder
 word32PaddedUpperHex w =
-  fromUnsafe (Unsafe.word32PaddedUpperHex w)
+  fromBounded (Bounded.word32PaddedUpperHex w)

 -- | Encode a 16-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 4 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 1022 as @03FE@.
 word16PaddedUpperHex :: Word16 -> Builder
 word16PaddedUpperHex w =
-  fromUnsafe (Unsafe.word16PaddedUpperHex w)
+  fromBounded (Bounded.word16PaddedUpperHex w)

 -- | Encode a 8-bit unsigned integer as hexadecimal, zero-padding
 -- the encoding to 2 digits. This uses uppercase for the alphabetical
 -- digits. For example, this encodes the number 11 as @0B@.
 word8PaddedUpperHex :: Word8 -> Builder
 word8PaddedUpperHex w =
-  fromUnsafe (Unsafe.word8PaddedUpperHex w)
+  fromBounded (Bounded.word8PaddedUpperHex w)
+
+-- | Encode an ASCII char.
+-- Precondition: Input must be an ASCII character. This is not checked.
+ascii :: Char -> Builder
+ascii c = fromBounded (Bounded.char c)
+
+-- | Encode an UTF8 char. This only uses as much space as is required.
+char :: Char -> Builder
+char c = fromBounded (Bounded.char c)

 unST :: ST s a -> State# s -> (# State# s, a #)
 unST (ST f) = f
@ -328,17 +340,20 @@ shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
 -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
 -- word in a big-endian fashion.
 word64BE :: Word64 -> Builder
-word64BE w = fromUnsafe (Unsafe.word64BE w)
+word64BE w = fromBounded (Bounded.word64BE w)

 -- | Requires exactly 4 bytes. Dump the octets of a 32-bit
 -- word in a big-endian fashion.
 word32BE :: Word32 -> Builder
-word32BE w = fromUnsafe (Unsafe.word32BE w)
+word32BE w = fromBounded (Bounded.word32BE w)

 -- | Requires exactly 2 bytes. Dump the octets of a 16-bit
 -- word in a big-endian fashion.
 word16BE :: Word16 -> Builder
-word16BE w = fromUnsafe (Unsafe.word16BE w)
+word16BE w = fromBounded (Bounded.word16BE w)
+
+word8 :: Word8 -> Builder
+word8 w = fromBounded (Bounded.word8 w)

 -- ShortText is already UTF-8 encoded. This is a no-op.
 shortTextToByteArray :: ShortText -> ByteArray
--- a/src/Data/ByteArray/Builder/Small/Bounded.hs
+++ b/src/Data/ByteArray/Builder/Small/Bounded.hs
@ -1,8 +1,9 @@
-{-# language GADTSyntax #-}
+{-# language GADTs #-}
 {-# language KindSignatures #-}
 {-# language ScopedTypeVariables #-}
 {-# language BangPatterns #-}
 {-# language MagicHash #-}
+{-# language BinaryLiterals #-}
 {-# language UnboxedTuples #-}
 {-# language RankNTypes #-}
 {-# language LambdaCase #-}
@ -10,9 +11,8 @@
 {-# language DataKinds #-}
 {-# language TypeApplications #-}

-- | The functions in this module do not check to
-- see if there is enough space in the buffer.
-module Data.ByteArray.Builder.Small.Unsafe
+-- | The functions in this module are explict in the amount of bytes they require.
+module Data.ByteArray.Builder.Small.Bounded
  ( -- * Builder
    Builder(..)
  , construct
@ -23,6 +23,11 @@ module Data.ByteArray.Builder.Small.Unsafe
  , pasteIO
    -- * Combine
  , append
+    -- * Bounds Manipulation
+  , (<=)
+  , lessThanEqual
+  , isLessThanEqual
+  , raise
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
@ -33,6 +38,8 @@ module Data.ByteArray.Builder.Small.Unsafe
  , word32PaddedUpperHex
  , word16PaddedUpperHex
  , word8PaddedUpperHex
+  , ascii
+  , char
    -- ** Machine-Readable
  , word64BE
  , word32BE
@ -53,8 +60,9 @@ import GHC.Word
 import GHC.Int
 import Data.Kind
 import GHC.TypeLits (KnownNat,Nat,type (+),natVal')
+import qualified GHC.TypeLits as GHC
 import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
-import Control.Monad (when)
+import qualified Control.Category as Cat

 import qualified Data.Primitive as PM

@ -65,13 +73,16 @@ newtype Builder :: Nat -> Type where
        (forall s. MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #))
     -> Builder n

+knownNat :: KnownNat n => Proxy# n -> Int
+knownNat p = fromIntegral (natVal' p)
+
 -- | Execute the builder. This function is safe.
 run :: forall n. KnownNat n
  => Builder n -- ^ Builder
  -> ByteArray
 {-# inline run #-}
 run b = runST $ do
-  arr <- newByteArray (fromIntegral (natVal' (proxy# :: Proxy# n)))
+  arr <- newByteArray (knownNat (proxy# :: Proxy# n))
  len <- pasteST b arr 0
  shrinkMutableByteArray arr len
  unsafeFreezeByteArray arr
@ -96,7 +107,7 @@ pasteGrowST :: forall n s. KnownNat n
 {-# inline pasteGrowST #-}
 pasteGrowST b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
  sz0 <- PM.getSizeofMutableByteArray arr0
-  let req = fromIntegral (natVal' (proxy# :: Proxy# n))
+  let req = knownNat (proxy# :: Proxy# n)
  let sz1 = off0 + req
  if sz1 <= sz0
    then do
@ -131,6 +142,33 @@ append (Builder f) (Builder g) =
  Builder $ \arr off0 s0 -> case f arr off0 s0 of
    (# s1, r #) -> g arr r s1

+-- | A proof that n is less than or equal to m
+newtype (n :: Nat) <= (m :: Nat) = LessThanEqual Int -- m - n
+
+instance Cat.Category (<=) where
+  id = LessThanEqual 0
+  -- b <= c (c - b) -> a <= b (b - a) -> a <= c (c - a)
+  LessThanEqual cb . LessThanEqual ba = LessThanEqual (cb + ba)
+
+-- | Dynamically check than 'n' is less than or equal to 'm'
+isLessThanEqual :: (KnownNat n, KnownNat m) => Proxy# n -> Proxy# m -> Maybe (n <= m)
+isLessThanEqual n m = if 0 <= diff then Just (LessThanEqual diff) else Nothing
+  where diff = knownNat m - knownNat n
+
+-- | Statically check than 'n' is less than or equal to 'm'. 'n' and 'm' must be known at compile time.
+lessThanEqual :: forall n m. (KnownNat n, KnownNat m, n GHC.<= m) => n <= m
+lessThanEqual = LessThanEqual (knownNat (proxy# :: Proxy# m) - knownNat (proxy# :: Proxy# n))
+
+-- | Weaken the bound on the maximum number of bytes required.
+-- >>> :{
+-- buildNumber :: Either Double Word64 -> Builder 32
+-- buildNumber = \case
+--   Left d  -> doubleDec d
+--   Right w -> raise lessThanEqual (word64Dec w)
+-- :}
+raise :: forall m n. n <= m -> Builder n -> Builder m
+raise !_ (Builder f) = Builder f
+
 -- | Encode a double-floating-point number, using decimal notation or
 -- scientific notation depending on the magnitude. This has undefined
 -- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
@ -302,6 +340,76 @@ word8PaddedUpperHex# w# = construct $ \arr off -> do
  where
  w = W# w#

+-- | Encode an ASCII char.
+-- Precondition: Input must be an ASCII character. This is not checked.
+ascii :: Char -> Builder 1
+ascii c = word8 (fromIntegral @Int @Word8 (ord c))
+
+-- | Encode an UTF8 char. This only uses as much space as is required.
+char :: Char -> Builder 4
+char c
+  | codepoint < 0x80 = construct $ \arr off -> do
+      writeByteArray arr off (unsafeWordToWord8 codepoint)
+      pure (off + 1)
+  | codepoint < 0x800 = construct $ \arr off -> do
+      writeByteArray arr off       (unsafeWordToWord8 (byteTwoOne codepoint))
+      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
+      return (off + 2)
+  | codepoint >= 0xD800 && codepoint < 0xE000 = construct $ \arr off -> do
+      -- Codepoint U+FFFD
+      writeByteArray arr off       (0xEF :: Word8)
+      writeByteArray arr (off + 1) (0xBF :: Word8)
+      writeByteArray arr (off + 2) (0xBD :: Word8)
+      return (off + 3)
+  | codepoint < 0x10000 = construct $ \arr off -> do
+      writeByteArray arr off       (unsafeWordToWord8 (byteThreeOne codepoint))
+      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
+      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
+      return (off + 3)
+  | otherwise = construct $ \arr off -> do
+      writeByteArray arr off       (unsafeWordToWord8 (byteFourOne codepoint))
+      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
+      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
+      writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
+      return (off + 4)
+
+  where
+    codepoint :: Word
+    codepoint = fromIntegral (ord c)
+
+    unsafeWordToWord8 :: Word -> Word8
+    unsafeWordToWord8 (W# w) = W8# w
+
+    -- precondition: codepoint is less than 0x800
+    byteTwoOne :: Word -> Word
+    byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
+
+    byteTwoTwo :: Word -> Word
+    byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
+
+    -- precondition: codepoint is less than 0x1000
+    byteThreeOne :: Word -> Word
+    byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
+
+    byteThreeTwo :: Word -> Word
+    byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
+
+    byteThreeThree :: Word -> Word
+    byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
+
+    -- precondition: codepoint is less than 0x110000
+    byteFourOne :: Word -> Word
+    byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
+
+    byteFourTwo :: Word -> Word
+    byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
+
+    byteFourThree :: Word -> Word
+    byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
+
+    byteFourFour :: Word -> Word
+    byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
+
 -- | Requires exactly 8 bytes. Dump the octets of a 64-bit
 -- word in a big-endian fashion.
 word64BE :: Word64 -> Builder 8