From d39c76a65aea93360253c0bfe1b6f1551622b116 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Mon, 13 Apr 2020 11:29:38 -0400 Subject: [PATCH] Add wordLEB128, word64LEB128, integerDec, naturalDec, and word48PaddedLowerHex --- CHANGELOG.md | 6 ++ bytebuild.cabal | 4 +- src/Data/Bytes/Builder.hs | 133 +++++++++++++++++++++++++++++- src/Data/Bytes/Builder/Bounded.hs | 73 ++++++++++++++-- test/Main.hs | 32 +++++++ 5 files changed, 240 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 515bc46..bc90dc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Note: Prior to version 0.3.4.0, this library was named `small-bytearray-builder` is now just a compatibility shim to ease the migration process. +## 0.3.5.0 -- 2020-??-?? + +* Add `wordLEB128` and `word64LEB128`. +* Add `integerDec` and `naturalDec`. +* Add `word48PaddedLowerHex`. + ## 0.3.4.0 -- 2020-02-27 * Rename the library from `small-bytearray-builder` to `bytebuild`, and diff --git a/bytebuild.cabal b/bytebuild.cabal index e1b451c..f01dd95 100644 --- a/bytebuild.cabal +++ b/bytebuild.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: bytebuild -version: 0.3.4.0 +version: 0.3.5.0 synopsis: Serialize to a small byte arrays description: This is similar to the builder facilities provided by @@ -46,6 +46,7 @@ library , base >=4.12.0.0 && <5 , byteslice >=0.2 && <0.3 , bytestring >=0.10.8.2 && <0.11 + , integer-logarithms >=1.0.3 && <1.1 , natural-arithmetic >=0.1 && <0.2 , primitive-offset >=0.2 && <0.3 , primitive-unlifted >=0.1.2 && <0.2 @@ -79,6 +80,7 @@ test-suite test , primitive , primitive-unlifted >=0.1.2 , quickcheck-classes >=0.6.4 + , quickcheck-instances >=0.3.22 , tasty >=1.2.3 && <1.3 , tasty-hunit >=0.10.0.2 && <0.11 , tasty-quickcheck >=0.10.1 && <0.11 diff --git a/src/Data/Bytes/Builder.hs b/src/Data/Bytes/Builder.hs index 1089daf..ff9ec3f 100644 --- a/src/Data/Bytes/Builder.hs +++ b/src/Data/Bytes/Builder.hs @@ -3,8 +3,10 @@ {-# language DuplicateRecordFields #-} {-# language LambdaCase #-} {-# language MagicHash #-} +{-# language NumericUnderscores #-} {-# language RankNTypes #-} {-# language ScopedTypeVariables #-} +{-# language TypeApplications #-} {-# language UnboxedTuples #-} module Data.Bytes.Builder @@ -36,11 +38,13 @@ module Data.Bytes.Builder , word16Dec , word8Dec , wordDec + , naturalDec , int64Dec , int32Dec , int16Dec , int8Dec , intDec + , integerDec -- * Unsigned Words -- ** 64-bit , word64PaddedUpperHex @@ -82,6 +86,10 @@ module Data.Bytes.Builder , int64LE , int32LE , int16LE + -- **** LEB128 + , intLEB128 + , wordLEB128 + , word64LEB128 -- *** Many , word8Array -- **** Big Endian @@ -117,12 +125,13 @@ module Data.Bytes.Builder import Control.Exception (SomeException,toException) import Control.Monad.ST (ST,runST) import Control.Monad.IO.Class (MonadIO,liftIO) +import Data.Bits (unsafeShiftR,unsafeShiftL,xor,finiteBitSize) import Data.Bytes.Builder.Unsafe (Builder(Builder)) import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO) import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable)) import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks) import Data.Bytes.Builder.Unsafe (commitsOntoChunks) -import Data.Bytes.Builder.Unsafe (stringUtf8,cstring) +import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect) import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits) import Data.ByteString.Short.Internal (ShortByteString(SBS)) import Data.Bytes.Chunks (Chunks(ChunksNil)) @@ -138,8 +147,13 @@ import Foreign.C.String (CStringLen) import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder) import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#)) import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#)) +import GHC.Exts ((*#)) +import GHC.Integer.Logarithms.Compat (integerLog2#) import GHC.IO (IO(IO),stToIO) +import GHC.Natural (naturalFromInteger,naturalToInteger) import GHC.ST (ST(ST)) +import GHC.Word (Word(W#),Word8(W8#)) +import Numeric.Natural (Natural) import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Types as Arithmetic @@ -909,7 +923,6 @@ word32LE w = fromBounded Nat.constant (Bounded.word32LE w) word16LE :: Word16 -> Builder word16LE w = fromBounded Nat.constant (Bounded.word16LE w) - -- | Requires exactly 32 bytes. Dump the octets of a 256-bit -- word in a big-endian fashion. word256BE :: Word256 -> Builder @@ -1043,3 +1056,119 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i) c2w :: Char -> Word8 c2w = fromIntegral . ord + +-- In C, this is: (n << 1) ^ (n >> (BIT_WIDTH - 1)) +zigZagNative :: Int -> Word +zigZagNative s = fromIntegral @Int @Word + ((unsafeShiftL s 1) `xor` (unsafeShiftR s (finiteBitSize (undefined :: Word) - 1))) + +-- | Encode a signed machine-sized integer with LEB-128. This uses +-- zig-zag encoding. +intLEB128 :: Int -> Builder +intLEB128 = wordLEB128 . zigZagNative + +-- | Encode a machine-sized word with LEB-128. +wordLEB128 :: Word -> Builder +wordLEB128 w = fromBounded Nat.constant (Bounded.wordLEB128 w) + +-- | Encode a 64-bit word with LEB-128. +word64LEB128 :: Word64 -> Builder +word64LEB128 w = fromBounded Nat.constant (Bounded.word64LEB128 w) + +-- | Encode a signed arbitrary-precision integer as decimal. +-- This encoding never starts with a zero unless the argument was zero. +-- Negative numbers are preceded by a minus sign. Positive numbers +-- are not preceded by anything. +integerDec :: Integer -> Builder +integerDec !i + | i < 0 = ascii '-' <> naturalDec (naturalFromInteger (negate i)) + | otherwise = naturalDec (naturalFromInteger i) + +-- | Encodes an unsigned arbitrary-precision integer as decimal. +-- This encoding never starts with a zero unless the argument was zero. +naturalDec :: Natural -> Builder +naturalDec !n0 = fromEffect + (I# (11# +# (3# *# integerLog2# (naturalToInteger n0)))) + (\marr off -> case n0 of + 0 -> do + PM.writeByteArray marr off (0x30 :: Word8) + pure (off + 1) + _ -> go n0 marr off off + ) + where + go :: forall s. Natural -> MutableByteArray s -> Int -> Int -> ST s Int + go !n !buf !off0 !off = case quotRem n 1_000_000_000 of + (q,r) -> case q of + 0 -> do + off' <- backwardsWordLoop buf off (fromIntegral @Natural @Word r) + reverseBytes buf off0 (off' - 1) + pure off' + _ -> do + off' <- backwardsPasteWordPaddedDec9 + (fromIntegral @Natural @Word r) buf off + go q buf off0 off' + +-- Reverse the bytes in the designated slice. This takes +-- an inclusive start offset and an inclusive end offset. +reverseBytes :: MutableByteArray s -> Int -> Int -> ST s () +{-# inline reverseBytes #-} +reverseBytes arr begin end = go begin end where + go ixA ixB = if ixA < ixB + then do + a :: Word8 <- PM.readByteArray arr ixA + b :: Word8 <- PM.readByteArray arr ixB + PM.writeByteArray arr ixA b + PM.writeByteArray arr ixB a + go (ixA + 1) (ixB - 1) + else pure () + +backwardsPasteWordPaddedDec9 :: + Word -> MutableByteArray s -> Int -> ST s Int +backwardsPasteWordPaddedDec9 !w !arr !off = do + backwardsPutRem10 + (backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $ + backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $ + backwardsPutRem10 $ backwardsPutRem10 + (\_ _ _ -> pure ()) + ) arr off w + pure (off + 9) + +backwardsPutRem10 :: + (MutableByteArray s -> Int -> Word -> ST s a) + -> MutableByteArray s -> Int -> Word -> ST s a +{-# inline backwardsPutRem10 #-} +backwardsPutRem10 andThen arr off dividend = do + let quotient = approxDiv10 dividend + remainder = dividend - (10 * quotient) + PM.writeByteArray arr off (unsafeWordToWord8 (remainder + 48)) + andThen arr (off + 1) quotient + +backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int +{-# inline backwardsWordLoop #-} +backwardsWordLoop arr off0 x0 = go off0 x0 where + go !off !(x :: Word) = if x > 0 + then do + let (y,z) = quotRem x 10 + PM.writeByteArray arr off (fromIntegral (z + 0x30) :: Word8) + go (off + 1) y + else pure off + +-- Based on C code from https://stackoverflow.com/a/5558614 +-- For numbers less than 1073741829, this gives a correct answer. +approxDiv10 :: Word -> Word +approxDiv10 !n = unsafeShiftR (0x1999999A * n) 32 + +-- -- A weird beast useful for rewrite rules. Not yet used. This will +-- -- ultimately replace fromEffect and fromBounded. +-- require :: Int -> Builder +-- require !n = Builder $ \buf0 off0 len0 cs0 s0 -> +-- let !(I# req) = n +-- in case len0 >=# req of +-- 1# -> (# s0, buf0, off0, len0, cs0 #) +-- _ -> let !(I# lenX) = max 4080 (I# req) in +-- case Exts.newByteArray# lenX s0 of +-- (# sX, bufX #) -> +-- (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #) + +unsafeWordToWord8 :: Word -> Word8 +unsafeWordToWord8 (W# w) = W8# w diff --git a/src/Data/Bytes/Builder/Bounded.hs b/src/Data/Bytes/Builder/Bounded.hs index d3d9ffd..ab00498 100644 --- a/src/Data/Bytes/Builder/Bounded.hs +++ b/src/Data/Bytes/Builder/Bounded.hs @@ -45,6 +45,8 @@ module Data.Bytes.Builder.Bounded -- ** 64-bit , word64PaddedLowerHex , word64PaddedUpperHex + -- ** 48-bit + , word48PaddedLowerHex -- ** 32-bit , word32PaddedLowerHex , word32PaddedUpperHex @@ -89,6 +91,9 @@ module Data.Bytes.Builder.Bounded , int64LE , int32LE , int16LE + -- **** LEB128 + , wordLEB128 + , word64LEB128 -- * Encode Floating-Point Types , doubleDec ) where @@ -161,7 +166,10 @@ infixr 9 `append` -- | Concatenate two builders. append :: Builder m -> Builder n -> Builder (m + n) -append (Builder f) (Builder g) = +append = unsafeAppend + +unsafeAppend :: Builder m -> Builder n -> Builder p +unsafeAppend (Builder f) (Builder g) = Builder $ \arr off0 s0 -> case f arr off0 s0 of (# s1, r #) -> g arr r s1 @@ -286,15 +294,20 @@ wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0 internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int {-# inline internalWordLoop #-} -internalWordLoop arr off0 x0 = go off0 x0 where +internalWordLoop arr off0 x0 = do + off1 <- backwardsWordLoop arr off0 x0 + reverseBytes arr off0 (off1 - 1) + pure off1 + +backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int +{-# inline backwardsWordLoop #-} +backwardsWordLoop arr off0 x0 = go off0 x0 where go !off !(x :: Word) = if x > 0 then do let (y,z) = quotRem x 10 writeByteArray arr off (fromIntegral (z + 0x30) :: Word8) go (off + 1) y - else do - reverseBytes arr off0 (off - 1) - pure off + else pure off -- Requires up to 20 bytes. Can be less depending on what the -- size of the argument is known to be. Unsafe. @@ -390,6 +403,14 @@ word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w word64PaddedLowerHex :: Word64 -> Builder 16 word64PaddedLowerHex (W64# w) = word64PaddedLowerHex# w +-- | Requires exactly 12 bytes. Discards the upper 16 bits of a +-- 64-bit unsigned integer and then encodes the lower 48 bits as +-- hexadecimal, zero-padding the encoding to 12 digits. This uses +-- lowercase for the alphabetical digits. For example, this encodes the +-- number 1022 as @0000000003fe@. +word48PaddedLowerHex :: Word64 -> Builder 12 +word48PaddedLowerHex (W64# w) = word48PaddedLowerHex# w + -- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as -- hexadecimal, zero-padding the encoding to 8 digits. This uses -- uppercase for the alphabetical digits. @@ -481,6 +502,27 @@ word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do where w = W# w# +-- TODO: Is it actually worth unrolling this loop. I suspect that it +-- might not be. Benchmark this. +word48PaddedLowerHex# :: Word# -> Builder 12 +{-# noinline word48PaddedLowerHex# #-} +word48PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do + writeByteArray arr off (toHexLower (unsafeShiftR w 44)) + writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 40)) + writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 36)) + writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 32)) + writeByteArray arr (off + 4) (toHexLower (unsafeShiftR w 28)) + writeByteArray arr (off + 5) (toHexLower (unsafeShiftR w 24)) + writeByteArray arr (off + 6) (toHexLower (unsafeShiftR w 20)) + writeByteArray arr (off + 7) (toHexLower (unsafeShiftR w 16)) + writeByteArray arr (off + 8) (toHexLower (unsafeShiftR w 12)) + writeByteArray arr (off + 9) (toHexLower (unsafeShiftR w 8)) + writeByteArray arr (off + 10) (toHexLower (unsafeShiftR w 4)) + writeByteArray arr (off + 11) (toHexLower w) + pure (off + 12) + where + w = W# w# + -- TODO: Is it actually worth unrolling this loop. I suspect that it -- might not be. Benchmark this. word64PaddedLowerHex# :: Word# -> Builder 16 @@ -752,6 +794,22 @@ ascii6 (C# c0) (C# c1) (C# c2) (C# c3) (C# c4) (C# c5) = Unsafe.construct $ \(Mu primitive_ (writeCharArray# arr (off +# 5# ) c5) pure (I# (off +# 6# )) +-- | Encode a machine-sized word with LEB-128. +wordLEB128 :: Word -> Builder 10 +wordLEB128 (W# w) = lebCommon (W# w) + +-- | Encode a 64-bit word with LEB-128. +word64LEB128 :: Word64 -> Builder 10 +word64LEB128 (W64# w) = lebCommon (W# w) + +lebCommon :: Word -> Builder n +lebCommon !w = case quotRem w 128 of + (q,r) -> case q of + 0 -> unsafeWord8 (unsafeWordToWord8 r) + _ -> unsafeAppend + (unsafeWord8 (unsafeWordToWord8 (r .|. 0x80))) + (lebCommon q) + -- | Encode a character as UTF-8. This only uses as much space as is required. char :: Char -> Builder 4 char c @@ -913,6 +971,11 @@ word8 w = Unsafe.construct $ \arr off -> do writeByteArray arr off w pure (off + 1) +unsafeWord8 :: Word8 -> Builder n +unsafeWord8 w = Unsafe.construct $ \arr off -> do + writeByteArray arr off w + pure (off + 1) + -- Reverse the bytes in the designated slice. This takes -- an inclusive start offset and an inclusive end offset. reverseBytes :: MutableByteArray s -> Int -> Int -> ST s () diff --git a/test/Main.hs b/test/Main.hs index 6354d45..3307f6c 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -15,13 +15,17 @@ import Data.Char (ord,chr) import Data.IORef (IORef,newIORef,readIORef,writeIORef) import Data.Primitive (ByteArray) import Data.WideWord (Word128(Word128),Word256(Word256)) +import Numeric.Natural (Natural) import Test.Tasty (defaultMain,testGroup,TestTree) import Test.QuickCheck ((===),Arbitrary) +import Test.QuickCheck.Instances.Natural () import Text.Printf (printf) import Test.Tasty.HUnit ((@=?)) import qualified Arithmetic.Nat as Nat +import qualified Data.Bits as Bits import qualified Data.Bytes.Builder.Bounded as Bounded +import qualified Data.Bytes as Bytes import qualified Data.ByteString as ByteString import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Lazy.Char8 as LB @@ -215,6 +219,19 @@ tests = testGroup "Tests" in runConcat 1 (foldMap word256BE xs) === runConcat 1 (word256ArrayBE ys 0 (Prelude.length xs)) + , TQC.testProperty "word64LEB128" $ \(x :: Word64) -> + runConcat 1 (word64LEB128 x) + === + naiveLeb128 (fromIntegral x) + , TQC.testProperty "naturalDec-A" $ \(x :: Natural) -> + runConcat 1 (naturalDec x) + === + pack (show x) + , TQC.testProperty "naturalDec-B" $ \(x :: Natural) -> + let y = 1234567892345678934678987654321 * x in + runConcat 1 (naturalDec y) + === + pack (show y) ] , testGroup "alternate" [ TQC.testProperty "HexWord64" $ \x y -> @@ -306,3 +323,18 @@ zeroPadL :: Int -> String -> String zeroPadL n s | length s < n = replicate (n - length s) '0' ++ s | otherwise = s + +naiveLeb128 :: Natural -> ByteArray +naiveLeb128 x = + Bytes.toByteArray (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : go [] x))) + where + go !xs !n = + let (q,r) = quotRem n 128 + r' = fromIntegral @Natural @Word8 r + w = if q == 0 + then r' + else Bits.setBit r' 7 + xs' = w : xs + in if q == 0 + then L.reverse xs' + else go xs' q