Add wordLEB128, word64LEB128, integerDec, naturalDec, and word48PaddedLowerHex

2020-04-13 11:29:38 -04:00 · 2020-04-13 11:29:38 -04:00 · d39c76a65a
commit d39c76a65a
parent 2ce46c4c4a
5 changed files with 240 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,12 @@ Note: Prior to version 0.3.4.0, this library was named
 `small-bytearray-builder` is now just a compatibility shim
 to ease the migration process.
 ## 0.3.5.0 -- 2020-??-??
 * Add `wordLEB128` and `word64LEB128`.
 * Add `integerDec` and `naturalDec`.
 * Add `word48PaddedLowerHex`.
 ## 0.3.4.0 -- 2020-02-27
 * Rename the library from `small-bytearray-builder` to `bytebuild`, and
--- a/bytebuild.cabal
+++ b/bytebuild.cabal
@ -1,6 +1,6 @@
 cabal-version: 2.2
 name: bytebuild
-version: 0.3.4.0
+version: 0.3.5.0
 synopsis: Serialize to a small byte arrays
 description:
  This is similar to the builder facilities provided by
@ -46,6 +46,7 @@ library
    , base >=4.12.0.0 && <5
    , byteslice >=0.2 && <0.3
    , bytestring >=0.10.8.2 && <0.11
    , integer-logarithms >=1.0.3 && <1.1
    , natural-arithmetic >=0.1 && <0.2
    , primitive-offset >=0.2 && <0.3
    , primitive-unlifted >=0.1.2 && <0.2
@ -79,6 +80,7 @@ test-suite test
    , primitive
    , primitive-unlifted >=0.1.2
    , quickcheck-classes >=0.6.4
    , quickcheck-instances >=0.3.22
    , tasty >=1.2.3 && <1.3
    , tasty-hunit >=0.10.0.2 && <0.11
    , tasty-quickcheck >=0.10.1 && <0.11
--- a/src/Data/Bytes/Builder.hs
+++ b/src/Data/Bytes/Builder.hs
@ -3,8 +3,10 @@
 {-# language DuplicateRecordFields #-}
 {-# language LambdaCase #-}
 {-# language MagicHash #-}
 {-# language NumericUnderscores #-}
 {-# language RankNTypes #-}
 {-# language ScopedTypeVariables #-}
 {-# language TypeApplications #-}
 {-# language UnboxedTuples #-}
 module Data.Bytes.Builder
@ -36,11 +38,13 @@ module Data.Bytes.Builder
  , word16Dec
  , word8Dec
  , wordDec
  , naturalDec
  , int64Dec
  , int32Dec
  , int16Dec
  , int8Dec
  , intDec
  , integerDec
    -- * Unsigned Words
    -- ** 64-bit
  , word64PaddedUpperHex
@ -82,6 +86,10 @@ module Data.Bytes.Builder
  , int64LE
  , int32LE
  , int16LE
    -- **** LEB128
  , intLEB128
  , wordLEB128
  , word64LEB128
    -- *** Many
  , word8Array
    -- **** Big Endian
@ -117,12 +125,13 @@ module Data.Bytes.Builder
 import Control.Exception (SomeException,toException)
 import Control.Monad.ST (ST,runST)
 import Control.Monad.IO.Class (MonadIO,liftIO)
 import Data.Bits (unsafeShiftR,unsafeShiftL,xor,finiteBitSize)
 import Data.Bytes.Builder.Unsafe (Builder(Builder))
 import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
 import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
-import Data.Bytes.Builder.Unsafe (stringUtf8,cstring)
+import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
 import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
 import Data.Bytes.Chunks (Chunks(ChunksNil))
@ -138,8 +147,13 @@ import Foreign.C.String (CStringLen)
 import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
 import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
 import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#))
 import GHC.Exts ((*#))
 import GHC.Integer.Logarithms.Compat (integerLog2#)
 import GHC.IO (IO(IO),stToIO)
 import GHC.Natural (naturalFromInteger,naturalToInteger)
 import GHC.ST (ST(ST))
 import GHC.Word (Word(W#),Word8(W8#))
 import Numeric.Natural (Natural)
 import qualified Arithmetic.Nat as Nat
 import qualified Arithmetic.Types as Arithmetic
@ -909,7 +923,6 @@ word32LE w = fromBounded Nat.constant (Bounded.word32LE w)
 word16LE :: Word16 -> Builder
 word16LE w = fromBounded Nat.constant (Bounded.word16LE w)
 -- | Requires exactly 32 bytes. Dump the octets of a 256-bit
 -- word in a big-endian fashion.
 word256BE :: Word256 -> Builder
@ -1043,3 +1056,119 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
 c2w :: Char -> Word8
 c2w = fromIntegral . ord
 -- In C, this is: (n << 1) ^ (n >> (BIT_WIDTH - 1))
 zigZagNative :: Int -> Word
 zigZagNative s = fromIntegral @Int @Word
  ((unsafeShiftL s 1) `xor` (unsafeShiftR s (finiteBitSize (undefined :: Word) - 1)))
 -- | Encode a signed machine-sized integer with LEB-128. This uses
 -- zig-zag encoding.
 intLEB128 :: Int -> Builder
 intLEB128 = wordLEB128 . zigZagNative
 -- | Encode a machine-sized word with LEB-128.
 wordLEB128 :: Word -> Builder
 wordLEB128 w = fromBounded Nat.constant (Bounded.wordLEB128 w)
 -- | Encode a 64-bit word with LEB-128.
 word64LEB128 :: Word64 -> Builder
 word64LEB128 w = fromBounded Nat.constant (Bounded.word64LEB128 w)
 -- | Encode a signed arbitrary-precision integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
 -- Negative numbers are preceded by a minus sign. Positive numbers
 -- are not preceded by anything.
 integerDec :: Integer -> Builder
 integerDec !i
  | i < 0 = ascii '-' <> naturalDec (naturalFromInteger (negate i))
  | otherwise = naturalDec (naturalFromInteger i)
 -- | Encodes an unsigned arbitrary-precision integer as decimal.
 -- This encoding never starts with a zero unless the argument was zero.
 naturalDec :: Natural -> Builder
 naturalDec !n0 = fromEffect
  (I# (11# +# (3# *# integerLog2# (naturalToInteger n0))))
  (\marr off -> case n0 of
    0 -> do
      PM.writeByteArray marr off (0x30 :: Word8)
      pure (off + 1)
    _ -> go n0 marr off off
  )
  where
  go :: forall s. Natural -> MutableByteArray s -> Int -> Int -> ST s Int
  go !n !buf !off0 !off = case quotRem n 1_000_000_000 of
    (q,r) -> case q of
      0 -> do
        off' <- backwardsWordLoop buf off (fromIntegral @Natural @Word r)
        reverseBytes buf off0 (off' - 1)
        pure off'
      _ -> do
        off' <- backwardsPasteWordPaddedDec9
          (fromIntegral @Natural @Word r) buf off
        go q buf off0 off'
 -- Reverse the bytes in the designated slice. This takes
 -- an inclusive start offset and an inclusive end offset.
 reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
 {-# inline reverseBytes #-}
 reverseBytes arr begin end = go begin end where
  go ixA ixB = if ixA < ixB
    then do
      a :: Word8 <- PM.readByteArray arr ixA
      b :: Word8 <- PM.readByteArray arr ixB
      PM.writeByteArray arr ixA b
      PM.writeByteArray arr ixB a
      go (ixA + 1) (ixB - 1)
    else pure ()
 backwardsPasteWordPaddedDec9 ::
  Word -> MutableByteArray s -> Int -> ST s Int
 backwardsPasteWordPaddedDec9 !w !arr !off = do
  backwardsPutRem10
    (backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
     backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
     backwardsPutRem10 $ backwardsPutRem10
     (\_ _ _ -> pure ())
    ) arr off w
  pure (off + 9)
 backwardsPutRem10 ::
     (MutableByteArray s -> Int -> Word -> ST s a)
  -> MutableByteArray s -> Int -> Word -> ST s a
 {-# inline backwardsPutRem10 #-}
 backwardsPutRem10 andThen arr off dividend = do
  let quotient = approxDiv10 dividend
      remainder = dividend - (10 * quotient)
  PM.writeByteArray arr off (unsafeWordToWord8 (remainder + 48))
  andThen arr (off + 1) quotient
 backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
 {-# inline backwardsWordLoop #-}
 backwardsWordLoop arr off0 x0 = go off0 x0 where
  go !off !(x :: Word) = if x > 0
    then do
      let (y,z) = quotRem x 10
      PM.writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
      go (off + 1) y
    else pure off
 -- Based on C code from https://stackoverflow.com/a/5558614
 -- For numbers less than 1073741829, this gives a correct answer.
 approxDiv10 :: Word -> Word
 approxDiv10 !n = unsafeShiftR (0x1999999A * n) 32
 -- -- A weird beast useful for rewrite rules. Not yet used. This will
 -- -- ultimately replace fromEffect and fromBounded.
 -- require :: Int -> Builder
 -- require !n = Builder $ \buf0 off0 len0 cs0 s0 ->
 --   let !(I# req) = n
 --    in case len0 >=# req of
 --         1# -> (# s0, buf0, off0, len0, cs0 #)
 --         _ -> let !(I# lenX) = max 4080 (I# req) in
 --           case Exts.newByteArray# lenX s0 of
 --             (# sX, bufX #) ->
 --               (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
 unsafeWordToWord8 :: Word -> Word8
 unsafeWordToWord8 (W# w) = W8# w
--- a/src/Data/Bytes/Builder/Bounded.hs
+++ b/src/Data/Bytes/Builder/Bounded.hs
@ -45,6 +45,8 @@ module Data.Bytes.Builder.Bounded
    -- ** 64-bit
  , word64PaddedLowerHex
  , word64PaddedUpperHex
    -- ** 48-bit
  , word48PaddedLowerHex
    -- ** 32-bit
  , word32PaddedLowerHex
  , word32PaddedUpperHex
@ -89,6 +91,9 @@ module Data.Bytes.Builder.Bounded
  , int64LE
  , int32LE
  , int16LE
    -- **** LEB128
  , wordLEB128
  , word64LEB128
    -- * Encode Floating-Point Types
  , doubleDec
  ) where
@ -161,7 +166,10 @@ infixr 9 `append`
 -- | Concatenate two builders.
 append :: Builder m -> Builder n -> Builder (m + n)
-append (Builder f) (Builder g) =
+append = unsafeAppend
 unsafeAppend :: Builder m -> Builder n -> Builder p
 unsafeAppend (Builder f) (Builder g) =
  Builder $ \arr off0 s0 -> case f arr off0 s0 of
    (# s1, r #) -> g arr r s1
@ -286,15 +294,20 @@ wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0
 internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
 {-# inline internalWordLoop #-}
-internalWordLoop arr off0 x0 = go off0 x0 where
+internalWordLoop arr off0 x0 = do
  off1 <- backwardsWordLoop arr off0 x0
  reverseBytes arr off0 (off1 - 1)
  pure off1
 backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
 {-# inline backwardsWordLoop #-}
 backwardsWordLoop arr off0 x0 = go off0 x0 where
  go !off !(x :: Word) = if x > 0
    then do
      let (y,z) = quotRem x 10
      writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
      go (off + 1) y
-    else do
+    else pure off
      reverseBytes arr off0 (off - 1)
      pure off
 -- Requires up to 20 bytes. Can be less depending on what the
 -- size of the argument is known to be. Unsafe.
@ -390,6 +403,14 @@ word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w
 word64PaddedLowerHex :: Word64 -> Builder 16
 word64PaddedLowerHex (W64# w) = word64PaddedLowerHex# w
 -- | Requires exactly 12 bytes. Discards the upper 16 bits of a
 -- 64-bit unsigned integer and then encodes the lower 48 bits as
 -- hexadecimal, zero-padding the encoding to 12 digits. This uses
 -- lowercase for the alphabetical digits. For example, this encodes the
 -- number 1022 as @0000000003fe@.
 word48PaddedLowerHex :: Word64 -> Builder 12
 word48PaddedLowerHex (W64# w) = word48PaddedLowerHex# w
 -- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as
 -- hexadecimal, zero-padding the encoding to 8 digits. This uses
 -- uppercase for the alphabetical digits.
@ -481,6 +502,27 @@ word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  where
  w = W# w#
 -- TODO: Is it actually worth unrolling this loop. I suspect that it
 -- might not be. Benchmark this.
 word48PaddedLowerHex# :: Word# -> Builder 12
 {-# noinline word48PaddedLowerHex# #-}
 word48PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexLower (unsafeShiftR w 44))
  writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 40))
  writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 36))
  writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 32))
  writeByteArray arr (off + 4) (toHexLower (unsafeShiftR w 28))
  writeByteArray arr (off + 5) (toHexLower (unsafeShiftR w 24))
  writeByteArray arr (off + 6) (toHexLower (unsafeShiftR w 20))
  writeByteArray arr (off + 7) (toHexLower (unsafeShiftR w 16))
  writeByteArray arr (off + 8) (toHexLower (unsafeShiftR w 12))
  writeByteArray arr (off + 9) (toHexLower (unsafeShiftR w 8))
  writeByteArray arr (off + 10) (toHexLower (unsafeShiftR w 4))
  writeByteArray arr (off + 11) (toHexLower w)
  pure (off + 12)
  where
  w = W# w#
 -- TODO: Is it actually worth unrolling this loop. I suspect that it
 -- might not be. Benchmark this.
 word64PaddedLowerHex# :: Word# -> Builder 16
@ -752,6 +794,22 @@ ascii6 (C# c0) (C# c1) (C# c2) (C# c3) (C# c4) (C# c5) = Unsafe.construct $ \(Mu
  primitive_ (writeCharArray# arr (off +# 5# ) c5)
  pure (I# (off +# 6# ))
 -- | Encode a machine-sized word with LEB-128.
 wordLEB128 :: Word -> Builder 10
 wordLEB128 (W# w) = lebCommon (W# w)
 -- | Encode a 64-bit word with LEB-128.
 word64LEB128 :: Word64 -> Builder 10
 word64LEB128 (W64# w) = lebCommon (W# w)
 lebCommon :: Word -> Builder n
 lebCommon !w = case quotRem w 128 of
  (q,r) -> case q of
    0 -> unsafeWord8 (unsafeWordToWord8 r)
    _ -> unsafeAppend
      (unsafeWord8 (unsafeWordToWord8 (r .|. 0x80)))
      (lebCommon q)
 -- | Encode a character as UTF-8. This only uses as much space as is required.
 char :: Char -> Builder 4
 char c
@ -913,6 +971,11 @@ word8 w = Unsafe.construct $ \arr off -> do
  writeByteArray arr off w
  pure (off + 1)
 unsafeWord8 :: Word8 -> Builder n
 unsafeWord8 w = Unsafe.construct $ \arr off -> do
  writeByteArray arr off w
  pure (off + 1)
 -- Reverse the bytes in the designated slice. This takes
 -- an inclusive start offset and an inclusive end offset.
 reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
--- a/test/Main.hs
+++ b/test/Main.hs
@ -15,13 +15,17 @@ import Data.Char (ord,chr)
 import Data.IORef (IORef,newIORef,readIORef,writeIORef)
 import Data.Primitive (ByteArray)
 import Data.WideWord (Word128(Word128),Word256(Word256))
 import Numeric.Natural (Natural)
 import Test.Tasty (defaultMain,testGroup,TestTree)
 import Test.QuickCheck ((===),Arbitrary)
 import Test.QuickCheck.Instances.Natural ()
 import Text.Printf (printf)
 import Test.Tasty.HUnit ((@=?))
 import qualified Arithmetic.Nat as Nat
 import qualified Data.Bits as Bits
 import qualified Data.Bytes.Builder.Bounded as Bounded
 import qualified Data.Bytes as Bytes
 import qualified Data.ByteString as ByteString
 import qualified Data.ByteString.Builder as BB
 import qualified Data.ByteString.Lazy.Char8 as LB
@ -215,6 +219,19 @@ tests = testGroup "Tests"
         in runConcat 1 (foldMap word256BE xs)
            ===
            runConcat 1 (word256ArrayBE ys 0 (Prelude.length xs))
    , TQC.testProperty "word64LEB128" $ \(x :: Word64) ->
        runConcat 1 (word64LEB128 x)
        ===
        naiveLeb128 (fromIntegral x)
    , TQC.testProperty "naturalDec-A" $ \(x :: Natural) ->
        runConcat 1 (naturalDec x)
        ===
        pack (show x)
    , TQC.testProperty "naturalDec-B" $ \(x :: Natural) ->
        let y = 1234567892345678934678987654321 * x in
        runConcat 1 (naturalDec y)
        ===
        pack (show y)
    ]
  , testGroup "alternate"
    [ TQC.testProperty "HexWord64" $ \x y ->
@ -306,3 +323,18 @@ zeroPadL :: Int -> String -> String
 zeroPadL n s
  | length s < n = replicate (n - length s) '0' ++ s
  | otherwise = s
 naiveLeb128 :: Natural -> ByteArray
 naiveLeb128 x =
  Bytes.toByteArray (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : go [] x)))
  where
  go !xs !n =
    let (q,r) = quotRem n 128
        r' = fromIntegral @Natural @Word8 r
        w = if q == 0
          then r'
          else Bits.setBit r' 7
        xs' = w : xs
     in if q == 0 
          then L.reverse xs'
          else go xs' q