Add wordLEB128, word64LEB128, integerDec, naturalDec, and word48PaddedLowerHex

2020-04-13 11:29:38 -04:00 · 2020-04-13 11:29:38 -04:00 · d39c76a65a
commit d39c76a65a
parent 2ce46c4c4a
5 changed files with 240 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,12 @@ Note: Prior to version 0.3.4.0, this library was named
 `small-bytearray-builder` is now just a compatibility shim
 to ease the migration process.

+## 0.3.5.0 -- 2020-??-??
+
+* Add `wordLEB128` and `word64LEB128`.
+* Add `integerDec` and `naturalDec`.
+* Add `word48PaddedLowerHex`.
+
 ## 0.3.4.0 -- 2020-02-27

 * Rename the library from `small-bytearray-builder` to `bytebuild`, and
--- a/bytebuild.cabal
+++ b/bytebuild.cabal
@ -1,6 +1,6 @@
 cabal-version: 2.2
 name: bytebuild
-version: 0.3.4.0
+version: 0.3.5.0
 synopsis: Serialize to a small byte arrays
 description:
  This is similar to the builder facilities provided by
@ -46,6 +46,7 @@ library
    , base >=4.12.0.0 && <5
    , byteslice >=0.2 && <0.3
    , bytestring >=0.10.8.2 && <0.11
+    , integer-logarithms >=1.0.3 && <1.1
    , natural-arithmetic >=0.1 && <0.2
    , primitive-offset >=0.2 && <0.3
    , primitive-unlifted >=0.1.2 && <0.2
@ -79,6 +80,7 @@ test-suite test
    , primitive
    , primitive-unlifted >=0.1.2
    , quickcheck-classes >=0.6.4
+    , quickcheck-instances >=0.3.22
    , tasty >=1.2.3 && <1.3
    , tasty-hunit >=0.10.0.2 && <0.11
    , tasty-quickcheck >=0.10.1 && <0.11
--- a/src/Data/Bytes/Builder.hs
+++ b/src/Data/Bytes/Builder.hs
@ -3,8 +3,10 @@
 {-# language DuplicateRecordFields #-}
 {-# language LambdaCase #-}
 {-# language MagicHash #-}
+{-# language NumericUnderscores #-}
 {-# language RankNTypes #-}
 {-# language ScopedTypeVariables #-}
+{-# language TypeApplications #-}
 {-# language UnboxedTuples #-}

 module Data.Bytes.Builder
@ -36,11 +38,13 @@ module Data.Bytes.Builder
  , word16Dec
  , word8Dec
  , wordDec
+  , naturalDec
  , int64Dec
  , int32Dec
  , int16Dec
  , int8Dec
  , intDec
+  , integerDec
    -- * Unsigned Words
    -- ** 64-bit
  , word64PaddedUpperHex
@ -82,6 +86,10 @@ module Data.Bytes.Builder
  , int64LE
  , int32LE
  , int16LE
+    -- **** LEB128
+  , intLEB128
+  , wordLEB128
+  , word64LEB128
    -- *** Many
  , word8Array
    -- **** Big Endian
@ -117,12 +125,13 @@ module Data.Bytes.Builder
 import Control.Exception (SomeException,toException)
 import Control.Monad.ST (ST,runST)
 import Control.Monad.IO.Class (MonadIO,liftIO)
+import Data.Bits (unsafeShiftR,unsafeShiftL,xor,finiteBitSize)
 import Data.Bytes.Builder.Unsafe (Builder(Builder))
 import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
 import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
 import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
 import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
-import Data.Bytes.Builder.Unsafe (stringUtf8,cstring)
+import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
 import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
 import Data.ByteString.Short.Internal (ShortByteString(SBS))
 import Data.Bytes.Chunks (Chunks(ChunksNil))
@ -138,8 +147,13 @@ import Foreign.C.String (CStringLen)
 import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
 import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
 import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#))
+import GHC.Exts ((*#))
+import GHC.Integer.Logarithms.Compat (integerLog2#)
 import GHC.IO (IO(IO),stToIO)
+import GHC.Natural (naturalFromInteger,naturalToInteger)
 import GHC.ST (ST(ST))
+import GHC.Word (Word(W#),Word8(W8#))
+import Numeric.Natural (Natural)

 import qualified Arithmetic.Nat as Nat
 import qualified Arithmetic.Types as Arithmetic
@ -909,7 +923,6 @@ word32LE w = fromBounded Nat.constant (Bounded.word32LE w)
 word16LE :: Word16 -> Builder
 word16LE w = fromBounded Nat.constant (Bounded.word16LE w)

-
 -- | Requires exactly 32 bytes. Dump the octets of a 256-bit
 -- word in a big-endian fashion.
 word256BE :: Word256 -> Builder
@ -1043,3 +1056,119 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)

 c2w :: Char -> Word8
 c2w = fromIntegral . ord
+
+-- In C, this is: (n << 1) ^ (n >> (BIT_WIDTH - 1))
+zigZagNative :: Int -> Word
+zigZagNative s = fromIntegral @Int @Word
+  ((unsafeShiftL s 1) `xor` (unsafeShiftR s (finiteBitSize (undefined :: Word) - 1)))
+
+-- | Encode a signed machine-sized integer with LEB-128. This uses
+-- zig-zag encoding.
+intLEB128 :: Int -> Builder
+intLEB128 = wordLEB128 . zigZagNative
+
+-- | Encode a machine-sized word with LEB-128.
+wordLEB128 :: Word -> Builder
+wordLEB128 w = fromBounded Nat.constant (Bounded.wordLEB128 w)
+
+-- | Encode a 64-bit word with LEB-128.
+word64LEB128 :: Word64 -> Builder
+word64LEB128 w = fromBounded Nat.constant (Bounded.word64LEB128 w)
+
+-- | Encode a signed arbitrary-precision integer as decimal.
+-- This encoding never starts with a zero unless the argument was zero.
+-- Negative numbers are preceded by a minus sign. Positive numbers
+-- are not preceded by anything.
+integerDec :: Integer -> Builder
+integerDec !i
+  | i < 0 = ascii '-' <> naturalDec (naturalFromInteger (negate i))
+  | otherwise = naturalDec (naturalFromInteger i)
+
+-- | Encodes an unsigned arbitrary-precision integer as decimal.
+-- This encoding never starts with a zero unless the argument was zero.
+naturalDec :: Natural -> Builder
+naturalDec !n0 = fromEffect
+  (I# (11# +# (3# *# integerLog2# (naturalToInteger n0))))
+  (\marr off -> case n0 of
+    0 -> do
+      PM.writeByteArray marr off (0x30 :: Word8)
+      pure (off + 1)
+    _ -> go n0 marr off off
+  )
+  where
+  go :: forall s. Natural -> MutableByteArray s -> Int -> Int -> ST s Int
+  go !n !buf !off0 !off = case quotRem n 1_000_000_000 of
+    (q,r) -> case q of
+      0 -> do
+        off' <- backwardsWordLoop buf off (fromIntegral @Natural @Word r)
+        reverseBytes buf off0 (off' - 1)
+        pure off'
+      _ -> do
+        off' <- backwardsPasteWordPaddedDec9
+          (fromIntegral @Natural @Word r) buf off
+        go q buf off0 off'
+
+-- Reverse the bytes in the designated slice. This takes
+-- an inclusive start offset and an inclusive end offset.
+reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
+{-# inline reverseBytes #-}
+reverseBytes arr begin end = go begin end where
+  go ixA ixB = if ixA < ixB
+    then do
+      a :: Word8 <- PM.readByteArray arr ixA
+      b :: Word8 <- PM.readByteArray arr ixB
+      PM.writeByteArray arr ixA b
+      PM.writeByteArray arr ixB a
+      go (ixA + 1) (ixB - 1)
+    else pure ()
+
+backwardsPasteWordPaddedDec9 ::
+  Word -> MutableByteArray s -> Int -> ST s Int
+backwardsPasteWordPaddedDec9 !w !arr !off = do
+  backwardsPutRem10
+    (backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
+     backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
+     backwardsPutRem10 $ backwardsPutRem10
+     (\_ _ _ -> pure ())
+    ) arr off w
+  pure (off + 9)
+
+backwardsPutRem10 ::
+     (MutableByteArray s -> Int -> Word -> ST s a)
+  -> MutableByteArray s -> Int -> Word -> ST s a
+{-# inline backwardsPutRem10 #-}
+backwardsPutRem10 andThen arr off dividend = do
+  let quotient = approxDiv10 dividend
+      remainder = dividend - (10 * quotient)
+  PM.writeByteArray arr off (unsafeWordToWord8 (remainder + 48))
+  andThen arr (off + 1) quotient
+
+backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
+{-# inline backwardsWordLoop #-}
+backwardsWordLoop arr off0 x0 = go off0 x0 where
+  go !off !(x :: Word) = if x > 0
+    then do
+      let (y,z) = quotRem x 10
+      PM.writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
+      go (off + 1) y
+    else pure off
+
+-- Based on C code from https://stackoverflow.com/a/5558614
+-- For numbers less than 1073741829, this gives a correct answer.
+approxDiv10 :: Word -> Word
+approxDiv10 !n = unsafeShiftR (0x1999999A * n) 32
+  
+-- -- A weird beast useful for rewrite rules. Not yet used. This will
+-- -- ultimately replace fromEffect and fromBounded.
+-- require :: Int -> Builder
+-- require !n = Builder $ \buf0 off0 len0 cs0 s0 ->
+--   let !(I# req) = n
+--    in case len0 >=# req of
+--         1# -> (# s0, buf0, off0, len0, cs0 #)
+--         _ -> let !(I# lenX) = max 4080 (I# req) in
+--           case Exts.newByteArray# lenX s0 of
+--             (# sX, bufX #) ->
+--               (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
+
+unsafeWordToWord8 :: Word -> Word8
+unsafeWordToWord8 (W# w) = W8# w
--- a/src/Data/Bytes/Builder/Bounded.hs
+++ b/src/Data/Bytes/Builder/Bounded.hs
@ -45,6 +45,8 @@ module Data.Bytes.Builder.Bounded
    -- ** 64-bit
  , word64PaddedLowerHex
  , word64PaddedUpperHex
+    -- ** 48-bit
+  , word48PaddedLowerHex
    -- ** 32-bit
  , word32PaddedLowerHex
  , word32PaddedUpperHex
@ -89,6 +91,9 @@ module Data.Bytes.Builder.Bounded
  , int64LE
  , int32LE
  , int16LE
+    -- **** LEB128
+  , wordLEB128
+  , word64LEB128
    -- * Encode Floating-Point Types
  , doubleDec
  ) where
@ -161,7 +166,10 @@ infixr 9 `append`

 -- | Concatenate two builders.
 append :: Builder m -> Builder n -> Builder (m + n)
-append (Builder f) (Builder g) =
+append = unsafeAppend
+
+unsafeAppend :: Builder m -> Builder n -> Builder p
+unsafeAppend (Builder f) (Builder g) =
  Builder $ \arr off0 s0 -> case f arr off0 s0 of
    (# s1, r #) -> g arr r s1

@ -286,15 +294,20 @@ wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0

 internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
 {-# inline internalWordLoop #-}
-internalWordLoop arr off0 x0 = go off0 x0 where
+internalWordLoop arr off0 x0 = do
+  off1 <- backwardsWordLoop arr off0 x0
+  reverseBytes arr off0 (off1 - 1)
+  pure off1
+
+backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
+{-# inline backwardsWordLoop #-}
+backwardsWordLoop arr off0 x0 = go off0 x0 where
  go !off !(x :: Word) = if x > 0
    then do
      let (y,z) = quotRem x 10
      writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
      go (off + 1) y
-    else do
-      reverseBytes arr off0 (off - 1)
-      pure off
+    else pure off

 -- Requires up to 20 bytes. Can be less depending on what the
 -- size of the argument is known to be. Unsafe.
@ -390,6 +403,14 @@ word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w
 word64PaddedLowerHex :: Word64 -> Builder 16
 word64PaddedLowerHex (W64# w) = word64PaddedLowerHex# w

+-- | Requires exactly 12 bytes. Discards the upper 16 bits of a
+-- 64-bit unsigned integer and then encodes the lower 48 bits as
+-- hexadecimal, zero-padding the encoding to 12 digits. This uses
+-- lowercase for the alphabetical digits. For example, this encodes the
+-- number 1022 as @0000000003fe@.
+word48PaddedLowerHex :: Word64 -> Builder 12
+word48PaddedLowerHex (W64# w) = word48PaddedLowerHex# w
+
 -- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as
 -- hexadecimal, zero-padding the encoding to 8 digits. This uses
 -- uppercase for the alphabetical digits.
@ -481,6 +502,27 @@ word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  where
  w = W# w#

+-- TODO: Is it actually worth unrolling this loop. I suspect that it
+-- might not be. Benchmark this.
+word48PaddedLowerHex# :: Word# -> Builder 12
+{-# noinline word48PaddedLowerHex# #-}
+word48PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
+  writeByteArray arr off (toHexLower (unsafeShiftR w 44))
+  writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 40))
+  writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 36))
+  writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 32))
+  writeByteArray arr (off + 4) (toHexLower (unsafeShiftR w 28))
+  writeByteArray arr (off + 5) (toHexLower (unsafeShiftR w 24))
+  writeByteArray arr (off + 6) (toHexLower (unsafeShiftR w 20))
+  writeByteArray arr (off + 7) (toHexLower (unsafeShiftR w 16))
+  writeByteArray arr (off + 8) (toHexLower (unsafeShiftR w 12))
+  writeByteArray arr (off + 9) (toHexLower (unsafeShiftR w 8))
+  writeByteArray arr (off + 10) (toHexLower (unsafeShiftR w 4))
+  writeByteArray arr (off + 11) (toHexLower w)
+  pure (off + 12)
+  where
+  w = W# w#
+
 -- TODO: Is it actually worth unrolling this loop. I suspect that it
 -- might not be. Benchmark this.
 word64PaddedLowerHex# :: Word# -> Builder 16
@ -752,6 +794,22 @@ ascii6 (C# c0) (C# c1) (C# c2) (C# c3) (C# c4) (C# c5) = Unsafe.construct $ \(Mu
  primitive_ (writeCharArray# arr (off +# 5# ) c5)
  pure (I# (off +# 6# ))

+-- | Encode a machine-sized word with LEB-128.
+wordLEB128 :: Word -> Builder 10
+wordLEB128 (W# w) = lebCommon (W# w)
+
+-- | Encode a 64-bit word with LEB-128.
+word64LEB128 :: Word64 -> Builder 10
+word64LEB128 (W64# w) = lebCommon (W# w)
+
+lebCommon :: Word -> Builder n
+lebCommon !w = case quotRem w 128 of
+  (q,r) -> case q of
+    0 -> unsafeWord8 (unsafeWordToWord8 r)
+    _ -> unsafeAppend
+      (unsafeWord8 (unsafeWordToWord8 (r .|. 0x80)))
+      (lebCommon q)
+
 -- | Encode a character as UTF-8. This only uses as much space as is required.
 char :: Char -> Builder 4
 char c
@ -913,6 +971,11 @@ word8 w = Unsafe.construct $ \arr off -> do
  writeByteArray arr off w
  pure (off + 1)

+unsafeWord8 :: Word8 -> Builder n
+unsafeWord8 w = Unsafe.construct $ \arr off -> do
+  writeByteArray arr off w
+  pure (off + 1)
+
 -- Reverse the bytes in the designated slice. This takes
 -- an inclusive start offset and an inclusive end offset.
 reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
--- a/test/Main.hs
+++ b/test/Main.hs
@ -15,13 +15,17 @@ import Data.Char (ord,chr)
 import Data.IORef (IORef,newIORef,readIORef,writeIORef)
 import Data.Primitive (ByteArray)
 import Data.WideWord (Word128(Word128),Word256(Word256))
+import Numeric.Natural (Natural)
 import Test.Tasty (defaultMain,testGroup,TestTree)
 import Test.QuickCheck ((===),Arbitrary)
+import Test.QuickCheck.Instances.Natural ()
 import Text.Printf (printf)
 import Test.Tasty.HUnit ((@=?))

 import qualified Arithmetic.Nat as Nat
+import qualified Data.Bits as Bits
 import qualified Data.Bytes.Builder.Bounded as Bounded
+import qualified Data.Bytes as Bytes
 import qualified Data.ByteString as ByteString
 import qualified Data.ByteString.Builder as BB
 import qualified Data.ByteString.Lazy.Char8 as LB
@ -215,6 +219,19 @@ tests = testGroup "Tests"
         in runConcat 1 (foldMap word256BE xs)
            ===
            runConcat 1 (word256ArrayBE ys 0 (Prelude.length xs))
+    , TQC.testProperty "word64LEB128" $ \(x :: Word64) ->
+        runConcat 1 (word64LEB128 x)
+        ===
+        naiveLeb128 (fromIntegral x)
+    , TQC.testProperty "naturalDec-A" $ \(x :: Natural) ->
+        runConcat 1 (naturalDec x)
+        ===
+        pack (show x)
+    , TQC.testProperty "naturalDec-B" $ \(x :: Natural) ->
+        let y = 1234567892345678934678987654321 * x in
+        runConcat 1 (naturalDec y)
+        ===
+        pack (show y)
    ]
  , testGroup "alternate"
    [ TQC.testProperty "HexWord64" $ \x y ->
@ -306,3 +323,18 @@ zeroPadL :: Int -> String -> String
 zeroPadL n s
  | length s < n = replicate (n - length s) '0' ++ s
  | otherwise = s
+
+naiveLeb128 :: Natural -> ByteArray
+naiveLeb128 x =
+  Bytes.toByteArray (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : go [] x)))
+  where
+  go !xs !n =
+    let (q,r) = quotRem n 128
+        r' = fromIntegral @Natural @Word8 r
+        w = if q == 0
+          then r'
+          else Bits.setBit r' 7
+        xs' = w : xs
+     in if q == 0 
+          then L.reverse xs'
+          else go xs' q