bytebuild/src/Data/ByteArray/Builder/Bounded.hs
2019-12-30 20:43:36 -05:00

858 lines
31 KiB
Haskell

{-# language BangPatterns #-}
{-# language BinaryLiterals #-}
{-# language DataKinds #-}
{-# language KindSignatures #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language TypeApplications #-}
{-# language TypeOperators #-}
{-# language UnboxedTuples #-}
-- | The functions in this module are explict about the maximum number
-- of bytes they require.
module Data.ByteArray.Builder.Bounded
( -- * Builder
Builder
-- * Execute
, run
, pasteGrowST
-- * Combine
, empty
, append
-- * Bounds Manipulation
, weaken
, substitute
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
, word32Dec
, word16Dec
, word8Dec
, wordDec
, int64Dec
, int32Dec
, int16Dec
, int8Dec
, intDec
-- * Unsigned Words
-- ** 64-bit
, word64PaddedUpperHex
-- ** 32-bit
, word32PaddedUpperHex
-- ** 16-bit
, word16PaddedLowerHex
, word16PaddedUpperHex
, word16LowerHex
, word16UpperHex
-- ** 8-bit
, word8PaddedUpperHex
, word8LowerHex
, ascii
, char
-- ** Native
, wordPaddedDec2
, wordPaddedDec9
-- ** Machine-Readable
-- *** One
, word8
-- **** Big Endian
, word128BE
, word64BE
, word32BE
, word16BE
, int64BE
, int32BE
, int16BE
-- **** Little Endian
, word128LE
, word64LE
, word32LE
, word16LE
, int64LE
, int32LE
, int16LE
-- * Encode Floating-Point Types
, doubleDec
) where
import Arithmetic.Types (type (<=), type (:=:))
import Control.Monad.Primitive
import Control.Monad.ST (ST)
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bits
import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..))
import Data.Char (ord)
import Data.Primitive
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import Data.WideWord (Word128(Word128))
import GHC.Exts
import GHC.Int (Int64(I64#),Int32(I32#),Int16(I16#),Int8(I8#))
import GHC.ST (ST(ST))
import GHC.TypeLits (type (+))
import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#))
import qualified Arithmetic.Lte as Lte
import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic
import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe
import qualified Data.Primitive as PM
-- | Execute the bounded builder. If the size is a constant,
-- use @Arithmetic.Nat.constant@ as the first argument to let
-- GHC conjure up this value for you.
run ::
Arithmetic.Nat n
-> Builder n -- ^ Builder
-> ByteArray
{-# inline run #-}
run n b = runByteArrayST $ do
arr <- newByteArray (Nat.demote n)
len <- Unsafe.pasteST b arr 0
shrinkMutableByteArray arr len
unsafeFreezeByteArray arr
-- | Paste the builder into the byte array starting at offset zero.
-- This reallocates the byte array if it cannot accomodate the builder,
-- growing it by the minimum amount necessary.
pasteGrowST ::
Arithmetic.Nat n
-> Builder n
-> MutableByteArrayOffset s
-- ^ Initial buffer, used linearly. Do not reuse this argument.
-> ST s (MutableByteArrayOffset s)
-- ^ Final buffer that accomodated the builder.
{-# inline pasteGrowST #-}
pasteGrowST n b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
sz0 <- PM.getSizeofMutableByteArray arr0
let req = Nat.demote n
let sz1 = off0 + req
if sz1 <= sz0
then do
off1 <- Unsafe.pasteST b arr0 off0
pure (MutableByteArrayOffset arr0 off1)
else do
arr1 <- PM.resizeMutableByteArray arr0 sz1
off1 <- Unsafe.pasteST b arr1 off0
pure (MutableByteArrayOffset arr1 off1)
-- | The monoidal unit of `append`
empty :: Builder 0
empty = Builder $ \_ off0 s0 -> (# s0, off0 #)
infixr 9 `append`
-- | Concatenate two builders.
append :: Builder m -> Builder n -> Builder (m + n)
append (Builder f) (Builder g) =
Builder $ \arr off0 s0 -> case f arr off0 s0 of
(# s1, r #) -> g arr r s1
-- | Weaken the bound on the maximum number of bytes required. For example,
-- to use two builders with unequal bounds in a disjunctive setting:
--
-- > import qualified Arithmetic.Lte as Lte
-- >
-- > buildNumber :: Either Double Word64 -> Builder 32
-- > buildNumber = \case
-- > Left d -> doubleDec d
-- > Right w -> weaken (Lte.constant @19 @32) (word64Dec w)
weaken :: forall m n. (m <= n) -> Builder m -> Builder n
weaken !_ (Builder f) = Builder f
-- | Replace the upper bound on size with an equal number.
substitute :: forall m n. (m :=: n) -> Builder m -> Builder n
substitute !_ (Builder f) = Builder f
-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
-- crash, but the generated numbers will be nonsense.
doubleDec :: Double -> Builder 32
doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)
-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word64Dec :: Word64 -> Builder 19
word64Dec (W64# w) = wordCommonDec# w
-- | Requires up to 10 bytes. Encodes an unsigned 32-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word32Dec :: Word32 -> Builder 10
word32Dec (W32# w) = wordCommonDec# w
-- | Requires up to 5 bytes. Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word16Dec :: Word16 -> Builder 5
word16Dec (W16# w) = wordCommonDec# w
-- | Requires up to 3 bytes. Encodes an unsigned 8-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word8Dec :: Word8 -> Builder 3
word8Dec (W8# w) =
-- We unroll the loop when encoding Word8s. This speeds things
-- up IPv4 encoding by about 10% in the @ip@ library. We can
-- encode Word8s at twice this speed by using a lookup table.
-- However, I (Andrew Martin) am concerned that although lookup
-- table perform very well in microbenchmarks, they can thrash
-- L1 cache in real applications.
word8Dec# w
-- | Requires up to 19 bytes. Encodes an unsigned machine-sized integer
-- as decimal. This encoding never starts with a zero unless the argument
-- was zero.
wordDec :: Word -> Builder 19
wordDec (W# w) = wordCommonDec# w
-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int64Dec :: Int64 -> Builder 20
int64Dec (I64# w) = intCommonDec# w
-- | Requires up to 11 bytes. Encodes a signed 32-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int32Dec :: Int32 -> Builder 11
int32Dec (I32# w) = intCommonDec# w
-- | Requires up to 6 bytes. Encodes a signed 16-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int16Dec :: Int16 -> Builder 6
int16Dec (I16# w) = intCommonDec# w
-- | Requires up to 4 bytes. Encodes a signed 8-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int8Dec :: Int8 -> Builder 4
int8Dec (I8# w) = intCommonDec# w
-- | Requires up to 20 bytes. Encodes a signed machine-sized integer
-- as decimal. This encoding never starts with a zero unless the
-- argument was zero. Negative numbers are preceded by a minus sign.
-- Positive numbers are not preceded by anything.
intDec :: Int -> Builder 20
intDec (I# w) = intCommonDec# w
word8Dec# :: Word# -> Builder 3
{-# noinline word8Dec# #-}
word8Dec# w# = Unsafe.construct $ \arr off0 -> do
let !(I# off0# ) = off0
!(!x,!ones) = quotRem w 10
!(hundreds@(W# hundreds# ),tens@(W# tens# )) = quotRem x 10
writeByteArray arr off0 (fromIntegral (hundreds + 0x30) :: Word8)
let !hasHundreds = gtWord# hundreds# 0##
!off1@(I# off1# ) = I# (off0# +# hasHundreds)
writeByteArray arr off1 (fromIntegral (tens + 0x30) :: Word8)
let !off2 = I# (off1# +# (orI# hasHundreds (gtWord# tens# 0## )))
writeByteArray arr off2 (fromIntegral (ones + 0x30) :: Word8)
pure (off2 + 1)
where
w = W# w#
-- Requires a number of bytes that is bounded by the size of
-- the word. This is only used internally.
wordCommonDec# :: Word# -> Builder n
{-# noinline wordCommonDec# #-}
wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0
then internalWordLoop arr off0 (W# w#)
else do
writeByteArray arr off0 (c2w '0')
pure (off0 + 1)
where
w = W64# w#
internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
{-# inline internalWordLoop #-}
internalWordLoop arr off0 x0 = go off0 x0 where
go !off !(x :: Word) = if x > 0
then do
let (y,z) = quotRem x 10
writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
go (off + 1) y
else do
reverseBytes arr off0 (off - 1)
pure off
-- Requires up to 20 bytes. Can be less depending on what the
-- size of the argument is known to be. Unsafe.
intCommonDec# :: Int# -> Builder n
{-# noinline intCommonDec# #-}
intCommonDec# w# = Unsafe.construct $ \arr off0 -> case compare w 0 of
GT -> internalWordLoop arr off0 (fromIntegral w)
EQ -> do
writeByteArray arr off0 (c2w '0')
pure (off0 + 1)
LT -> do
writeByteArray arr off0 (c2w '-')
internalWordLoop arr (off0 + 1) (fromIntegral (negate w))
where
w = I64# w#
-- Convert a number between 0 and 16 to the ASCII
-- representation of its hexadecimal character.
-- The use of fromIntegral causes us to incur an
-- unneeded bitmask. This actually needs a Word64
-- argument.
toHexUpper :: Word -> Word8
toHexUpper w' = fromIntegral
$ (complement theMask .&. loSolved)
.|. (theMask .&. hiSolved)
where
w = w' .&. 0xF
-- This is all ones if the value was >= 10
theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1
loSolved = w + 48
hiSolved = w + 55
toHexLower :: Word -> Word8
toHexLower w' = fromIntegral
$ (complement theMask .&. loSolved)
.|. (theMask .&. hiSolved)
where
w = w' .&. 0xF
-- This is all ones if the value was >= 10
theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1
loSolved = w + 48
hiSolved = w + 87
-- | Requires exactly 16 bytes. Encodes a 64-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 16 digits. This uses
-- uppercase for the alphabetical digits. For example, this encodes the
-- number 1022 as @00000000000003FE@.
word64PaddedUpperHex :: Word64 -> Builder 16
word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w
-- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 8 digits. This uses
-- uppercase for the alphabetical digits.
word32PaddedUpperHex :: Word32 -> Builder 8
word32PaddedUpperHex (W32# w) = word32PaddedUpperHex# w
-- | Requires exactly 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 4 digits. This uses
-- uppercase for the alphabetical digits.
--
-- >>> word16PaddedUpperHex 0xab0
-- 0AB0
word16PaddedUpperHex :: Word16 -> Builder 4
word16PaddedUpperHex (W16# w) = word16PaddedUpperHex# w
-- | Requires exactly 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 4 digits. This uses
-- lowercase for the alphabetical digits.
--
-- >>> word16PaddedLowerHex 0xab0
-- 0ab0
word16PaddedLowerHex :: Word16 -> Builder 4
word16PaddedLowerHex (W16# w) = word16PaddedLowerHex# w
-- | Requires at most 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. Letters are presented
-- in lowercase. If the number is zero, a single zero digit is used.
--
-- >>> word16LowerHex 0xab0
-- ab0
word16LowerHex :: Word16 -> Builder 4
word16LowerHex (W16# w) = word16LowerHex# w
-- | Requires at most 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. Letters are presented
-- in uppercase. If the number is zero, a single zero digit is used.
--
-- >>> word16UpperHex 0xab0
-- AB0
word16UpperHex :: Word16 -> Builder 4
word16UpperHex (W16# w) = word16UpperHex# w
-- | Requires at most 2 bytes. Encodes a 8-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. If the number is zero,
-- a single zero digit is used.
word8LowerHex :: Word8 -> Builder 2
word8LowerHex (W8# w) = word8LowerHex# w
-- | Requires exactly 2 bytes. Encodes a 8-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 2 digits. This uses
-- uppercase for the alphabetical digits.
word8PaddedUpperHex :: Word8 -> Builder 2
word8PaddedUpperHex (W8# w) = word8PaddedUpperHex# w
-- TODO: Is it actually worth unrolling this loop. I suspect that it
-- might not be. Benchmark this.
word64PaddedUpperHex# :: Word# -> Builder 16
{-# noinline word64PaddedUpperHex# #-}
word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 60))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56))
writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52))
writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 48))
writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 44))
writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 40))
writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 36))
writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 32))
writeByteArray arr (off + 8) (toHexUpper (unsafeShiftR w 28))
writeByteArray arr (off + 9) (toHexUpper (unsafeShiftR w 24))
writeByteArray arr (off + 10) (toHexUpper (unsafeShiftR w 20))
writeByteArray arr (off + 11) (toHexUpper (unsafeShiftR w 16))
writeByteArray arr (off + 12) (toHexUpper (unsafeShiftR w 12))
writeByteArray arr (off + 13) (toHexUpper (unsafeShiftR w 8))
writeByteArray arr (off + 14) (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 15) (toHexUpper (unsafeShiftR w 0))
pure (off + 16)
where
w = W# w#
word32PaddedUpperHex# :: Word# -> Builder 8
{-# noinline word32PaddedUpperHex# #-}
word32PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 28))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 24))
writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 20))
writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 16))
writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 12))
writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 8))
writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 0))
pure (off + 8)
where
w = W# w#
-- Not sure if it is beneficial to inline this. We just let
-- GHC make the decision. Open an issue on github if this is
-- a problem.
word16PaddedUpperHex# :: Word# -> Builder 4
word16PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 12))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 8))
writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 0))
pure (off + 4)
where
w = W# w#
word16PaddedLowerHex# :: Word# -> Builder 4
word16PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexLower (unsafeShiftR w 12))
writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 8))
writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 4))
writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 0))
pure (off + 4)
where
w = W# w#
word12PaddedLowerHex# :: Word# -> Builder 3
word12PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexLower (unsafeShiftR w 8))
writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 4))
writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 0))
pure (off + 3)
where
w = W# w#
word12PaddedUpperHex# :: Word# -> Builder 3
word12PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 8))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 0))
pure (off + 3)
where
w = W# w#
-- Definitely want this to inline. It's maybe a dozen instructions total.
word8PaddedUpperHex# :: Word# -> Builder 2
{-# inline word8PaddedUpperHex# #-}
word8PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper (unsafeShiftR w 4))
writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 0))
pure (off + 2)
where
w = W# w#
word8PaddedLowerHex# :: Word# -> Builder 2
{-# inline word8PaddedLowerHex# #-}
word8PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexLower (unsafeShiftR w 4))
writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 0))
pure (off + 2)
where
w = W# w#
word4PaddedLowerHex# :: Word# -> Builder 1
{-# inline word4PaddedLowerHex# #-}
word4PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexLower w)
pure (off + 1)
where
w = W# w#
word4PaddedUpperHex# :: Word# -> Builder 1
{-# inline word4PaddedUpperHex# #-}
word4PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexUpper w)
pure (off + 1)
where
w = W# w#
word16UpperHex# :: Word# -> Builder 4
word16UpperHex# w#
| w <= 0xF = weaken Lte.constant (word4PaddedUpperHex# w#)
| w <= 0xFF = weaken Lte.constant (word8PaddedUpperHex# w#)
| w <= 0xFFF = weaken Lte.constant (word12PaddedUpperHex# w#)
| otherwise = word16PaddedUpperHex# w#
where
w = W# w#
word16LowerHex# :: Word# -> Builder 4
word16LowerHex# w#
| w <= 0xF = weaken Lte.constant (word4PaddedLowerHex# w#)
| w <= 0xFF = weaken Lte.constant (word8PaddedLowerHex# w#)
| w <= 0xFFF = weaken Lte.constant (word12PaddedLowerHex# w#)
| otherwise = word16PaddedLowerHex# w#
where
w = W# w#
-- Precondition: argument less than 256
word8LowerHex# :: Word# -> Builder 2
word8LowerHex# w#
| w <= 0xF = weaken Lte.constant (word4PaddedLowerHex# w#)
| otherwise = weaken Lte.constant (word8PaddedLowerHex# w#)
where
w = W# w#
-- | Encode a number less than 100 as a decimal number, zero-padding it to
-- two digits. For example: 0 is encoded as @00@, 5 is encoded as @05@, and
-- 73 is encoded as @73@.
--
-- Precondition: Argument must be less than 100. Failure to satisfy this
-- precondition will not result in a segfault, but the resulting bytes are
-- undefined. The implemention uses a heuristic for division that is inaccurate
-- for large numbers.
wordPaddedDec2 :: Word -> Builder 2
wordPaddedDec2 !w = Unsafe.construct $ \arr off -> do
let d1 = approxDiv10 w
d2 = w - (10 * d1)
writeByteArray arr off (unsafeWordToWord8 (d1 + 48))
writeByteArray arr (off + 1) (unsafeWordToWord8 (d2 + 48))
pure (off + 2)
-- | Encode a number less than 1e9 as a decimal number, zero-padding it to
-- nine digits. For example: 0 is encoded as @000000000@ and 5 is encoded as
-- @000000005@.
--
-- Precondition: Argument must be less than 1e9. Failure to satisfy this
-- precondition will not result in a segfault, but the resulting bytes are
-- undefined. The implemention uses a heuristic for division that is inaccurate
-- for large numbers.
wordPaddedDec9 :: Word -> Builder 9
wordPaddedDec9 !w = Unsafe.construct $ \arr off -> do
putRem10
(putRem10 $ putRem10 $ putRem10 $ putRem10 $ putRem10 $
putRem10 $ putRem10 $ putRem10
(\_ _ _ -> pure ())
) arr (off + 8) w
pure (off + 9)
putRem10 :: (MutableByteArray s -> Int -> Word -> ST s a) -> MutableByteArray s -> Int -> Word -> ST s a
{-# inline putRem10 #-}
putRem10 andThen arr off dividend = do
let quotient = approxDiv10 dividend
remainder = dividend - (10 * quotient)
writeByteArray arr off (unsafeWordToWord8 (remainder + 48))
andThen arr (off - 1) quotient
-- | Encode an ASCII character.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder 1
ascii (C# c) = Unsafe.construct $ \(MutableByteArray arr) (I# off) -> do
primitive_ (writeCharArray# arr off c)
pure (I# (off +# 1# ))
-- | Encode a character as UTF-8. This only uses as much space as is required.
char :: Char -> Builder 4
char c
| codepoint < 0x80 = Unsafe.construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 codepoint)
pure (off + 1)
| codepoint < 0x800 = Unsafe.construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteTwoOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
return (off + 2)
| codepoint >= 0xD800 && codepoint < 0xE000 = Unsafe.construct $ \arr off -> do
-- Codepoint U+FFFD
writeByteArray arr off (0xEF :: Word8)
writeByteArray arr (off + 1) (0xBF :: Word8)
writeByteArray arr (off + 2) (0xBD :: Word8)
return (off + 3)
| codepoint < 0x10000 = Unsafe.construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteThreeOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
return (off + 3)
| otherwise = Unsafe.construct $ \arr off -> do
writeByteArray arr off (unsafeWordToWord8 (byteFourOne codepoint))
writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
return (off + 4)
where
codepoint :: Word
codepoint = fromIntegral (ord c)
-- precondition: codepoint is less than 0x800
byteTwoOne :: Word -> Word
byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000
byteTwoTwo :: Word -> Word
byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x1000
byteThreeOne :: Word -> Word
byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000
byteThreeTwo :: Word -> Word
byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteThreeThree :: Word -> Word
byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000
-- precondition: codepoint is less than 0x110000
byteFourOne :: Word -> Word
byteFourOne w = unsafeShiftR w 18 .|. 0b11110000
byteFourTwo :: Word -> Word
byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000
byteFourThree :: Word -> Word
byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000
byteFourFour :: Word -> Word
byteFourFour w = (0b00111111 .&. w) .|. 0b10000000
int64BE :: Int64 -> Builder 8
int64BE (I64# i) = word64BE (W64# (int2Word# i))
int32BE :: Int32 -> Builder 4
int32BE (I32# i) = word32BE (W32# (int2Word# i))
int16BE :: Int16 -> Builder 2
int16BE (I16# i) = word16BE (W16# (int2Word# i))
int64LE :: Int64 -> Builder 8
int64LE (I64# i) = word64LE (W64# (int2Word# i))
int32LE :: Int32 -> Builder 4
int32LE (I32# i) = word32LE (W32# (int2Word# i))
int16LE :: Int16 -> Builder 2
int16LE (I16# i) = word16LE (W16# (int2Word# i))
word128LE :: Word128 -> Builder 16
word128LE (Word128 hi lo) = append (word64LE lo) (word64LE hi)
word128BE :: Word128 -> Builder 16
word128BE (Word128 hi lo) = append (word64BE hi) (word64BE lo)
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a little-endian fashion.
word64LE :: Word64 -> Builder 8
word64LE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off + 7) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
writeByteArray arr (off + 6) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
writeByteArray arr (off + 5) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
writeByteArray arr (off + 4) (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
writeByteArray arr (off + 3) (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off ) (fromIntegral @Word64 @Word8 w)
pure (off + 8)
-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion.
word64BE :: Word64 -> Builder 8
word64BE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off ) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
writeByteArray arr (off + 3) (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
writeByteArray arr (off + 4) (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
writeByteArray arr (off + 5) (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
writeByteArray arr (off + 6) (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off + 7) (fromIntegral @Word64 @Word8 w)
pure (off + 8)
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a little-endian fashion.
word32LE :: Word32 -> Builder 4
word32LE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off + 3) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off ) (fromIntegral @Word32 @Word8 w)
pure (off + 4)
-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a big-endian fashion.
word32BE :: Word32 -> Builder 4
word32BE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off ) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off + 3) (fromIntegral @Word32 @Word8 w)
pure (off + 4)
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a little-endian fashion.
word16LE :: Word16 -> Builder 2
word16LE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off ) (fromIntegral @Word16 @Word8 w)
pure (off + 2)
-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a big-endian fashion.
word16BE :: Word16 -> Builder 2
word16BE w = Unsafe.construct $ \arr off -> do
writeByteArray arr (off ) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 w)
pure (off + 2)
word8 :: Word8 -> Builder 1
word8 w = Unsafe.construct $ \arr off -> do
writeByteArray arr off w
pure (off + 1)
-- Reverse the bytes in the designated slice. This takes
-- an inclusive start offset and an inclusive end offset.
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
{-# inline reverseBytes #-}
reverseBytes arr begin end = go begin end where
go ixA ixB = if ixA < ixB
then do
a :: Word8 <- readByteArray arr ixA
b :: Word8 <- readByteArray arr ixB
writeByteArray arr ixA b
writeByteArray arr ixB a
go (ixA + 1) (ixB - 1)
else pure ()
c2w :: Char -> Word8
c2w = fromIntegral . ord
shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s ()
shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
primitive_ (shrinkMutableByteArray# arr sz)
-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
-- The checks for infinity and NaN have been removed. Note that this is a little
-- inaccurate. This is very visible when encoding a number like 2.25, which
-- is perfectly represented as an IEEE 754 floating point number but is goofed
-- up by this function.
-- If you modify this function, please take a took at the resulting core.
-- It currently performs no boxing at all, and it would be nice to keep
-- it that way.
doubleDec# :: forall s.
Double# -> MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)
{-# noinline doubleDec# #-}
doubleDec# d# marr# off# s0 = unIntST s0 $ do
let marr = MutableByteArray marr#
let d0 = D# d#
let off0 = I# off#
if d0 == 0
then do
writeByteArray marr off0 (c2w '0')
pure (off0 + 1)
else do
let neg = d0 < 0
off1 <- if neg
then do
writeByteArray marr off0 (c2w '-')
pure (off0 + 1)
else pure off0
let d1 = abs d0
let mag0 = floor (logBase10 d1) :: Int
let useExp = (mag0 >= 14 || (neg && mag0 >= 9) || mag0 <= (-9))
-- This straightforward adaptation of the C code is awkward
-- in Haskell. Binding the triple where mag1 might not even
-- get used is strange.
let !(!d2,!mag1,!mag0A) = if useExp
then
let mag0' = if mag0 < 0 then mag0 - 1 else mag0
in (d1 / (10.0 ** fromIntegral @Int @Double mag0'), mag0', 0)
else (d1,0,mag0)
let mag0B = if mag0A < 1 then 0 else mag0A
let goNum :: Double -> Int -> Int -> ST s Int
goNum !dA0 !mag !offA0 = if (dA0 > doublePrecision || mag >= 0)
then do
let weight = 10.0 ** (fromIntegral @Int @Double mag)
-- We should actually check weight with isinf here,
-- but we do not.
(dA1,offA1) <- if weight > 0
then do
-- TODO: use a better floor function
let digit = ((floor :: Double -> Int) (dA0 / weight))
let discard = fromIntegral @Int @Double digit * weight
writeByteArray marr offA0
(fromIntegral @Int @Word8 (digit + ord '0'))
pure (dA0 - discard,offA0 + 1)
else pure (dA0,offA0)
offA2 <- if mag == 0 && dA1 > 0
then do
writeByteArray marr offA1 (c2w '.')
pure (offA1 + 1)
else pure offA1
goNum dA1 (mag - 1) offA2
else pure offA0
!off2 <- goNum d2 mag0B off1
off3 <- if useExp
then do
writeByteArray marr off2 (c2w 'e')
!mag2 <- if mag1 > 0
then do
writeByteArray marr (off2 + 1) (c2w '+')
pure mag1
else do
writeByteArray marr (off2 + 1) (c2w '-')
pure (-mag1)
let goMag !mag !off = if mag > 0
then do
let (q,r) = quotRem mag 10
writeByteArray marr off (fromIntegral @Int @Word8 (ord '0' + r))
goMag q (off + 1)
else pure off
!off3 <- goMag mag2 (off2 + 2)
reverseBytes marr (off2 + 2) (off3 - 1)
pure off3
else pure off2
pure off3
doublePrecision :: Double
doublePrecision = 0.00000000000001
unIntST :: State# s -> ST s Int -> (# State# s, Int# #)
{-# inline unIntST #-}
unIntST s0 (ST f) = case f s0 of
(# s1, I# i #) -> (# s1, i #)
-- This is slightly inaccurate. I think this can actually cause
-- problems in some situations. The log10 function from C would
-- be better. The inaccuracy here cause the logarithm to be slightly
-- larger than it should be. There might actually be a simple way to
-- fix this by just using recursion to compute it. We just floor the
-- result anyway. Hmm...
logBase10 :: Double -> Double
logBase10 d = log d / 2.30258509299
-- Based on C code from https://stackoverflow.com/a/5558614
-- For numbers less than 1073741829, this gives a correct answer.
approxDiv10 :: Word -> Word
approxDiv10 !n = unsafeShiftR (0x1999999A * n) 32
unsafeWordToWord8 :: Word -> Word8
unsafeWordToWord8 (W# w) = W8# w