Add wordLEB128, word64LEB128, integerDec, naturalDec, and word48PaddedLowerHex

This commit is contained in:
Andrew Martin 2020-04-13 11:29:38 -04:00
parent 2ce46c4c4a
commit d39c76a65a
5 changed files with 240 additions and 8 deletions

View file

@ -5,6 +5,12 @@ Note: Prior to version 0.3.4.0, this library was named
`small-bytearray-builder` is now just a compatibility shim
to ease the migration process.
## 0.3.5.0 -- 2020-??-??
* Add `wordLEB128` and `word64LEB128`.
* Add `integerDec` and `naturalDec`.
* Add `word48PaddedLowerHex`.
## 0.3.4.0 -- 2020-02-27
* Rename the library from `small-bytearray-builder` to `bytebuild`, and

View file

@ -1,6 +1,6 @@
cabal-version: 2.2
name: bytebuild
version: 0.3.4.0
version: 0.3.5.0
synopsis: Serialize to a small byte arrays
description:
This is similar to the builder facilities provided by
@ -46,6 +46,7 @@ library
, base >=4.12.0.0 && <5
, byteslice >=0.2 && <0.3
, bytestring >=0.10.8.2 && <0.11
, integer-logarithms >=1.0.3 && <1.1
, natural-arithmetic >=0.1 && <0.2
, primitive-offset >=0.2 && <0.3
, primitive-unlifted >=0.1.2 && <0.2
@ -79,6 +80,7 @@ test-suite test
, primitive
, primitive-unlifted >=0.1.2
, quickcheck-classes >=0.6.4
, quickcheck-instances >=0.3.22
, tasty >=1.2.3 && <1.3
, tasty-hunit >=0.10.0.2 && <0.11
, tasty-quickcheck >=0.10.1 && <0.11

View file

@ -3,8 +3,10 @@
{-# language DuplicateRecordFields #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language NumericUnderscores #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language TypeApplications #-}
{-# language UnboxedTuples #-}
module Data.Bytes.Builder
@ -36,11 +38,13 @@ module Data.Bytes.Builder
, word16Dec
, word8Dec
, wordDec
, naturalDec
, int64Dec
, int32Dec
, int16Dec
, int8Dec
, intDec
, integerDec
-- * Unsigned Words
-- ** 64-bit
, word64PaddedUpperHex
@ -82,6 +86,10 @@ module Data.Bytes.Builder
, int64LE
, int32LE
, int16LE
-- **** LEB128
, intLEB128
, wordLEB128
, word64LEB128
-- *** Many
, word8Array
-- **** Big Endian
@ -117,12 +125,13 @@ module Data.Bytes.Builder
import Control.Exception (SomeException,toException)
import Control.Monad.ST (ST,runST)
import Control.Monad.IO.Class (MonadIO,liftIO)
import Data.Bits (unsafeShiftR,unsafeShiftL,xor,finiteBitSize)
import Data.Bytes.Builder.Unsafe (Builder(Builder))
import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
import Data.Bytes.Builder.Unsafe (Commits(Initial,Mutable,Immutable))
import Data.Bytes.Builder.Unsafe (reverseCommitsOntoChunks)
import Data.Bytes.Builder.Unsafe (commitsOntoChunks)
import Data.Bytes.Builder.Unsafe (stringUtf8,cstring)
import Data.Bytes.Builder.Unsafe (stringUtf8,cstring,fromEffect)
import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
import Data.ByteString.Short.Internal (ShortByteString(SBS))
import Data.Bytes.Chunks (Chunks(ChunksNil))
@ -138,8 +147,13 @@ import Foreign.C.String (CStringLen)
import GHC.ByteOrder (ByteOrder(BigEndian,LittleEndian),targetByteOrder)
import GHC.Exts (Int(I#),Char(C#),Int#,State#,ByteArray#,(>=#))
import GHC.Exts (RealWorld,MutableByteArray#,(+#),(-#),(<#))
import GHC.Exts ((*#))
import GHC.Integer.Logarithms.Compat (integerLog2#)
import GHC.IO (IO(IO),stToIO)
import GHC.Natural (naturalFromInteger,naturalToInteger)
import GHC.ST (ST(ST))
import GHC.Word (Word(W#),Word8(W8#))
import Numeric.Natural (Natural)
import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic
@ -909,7 +923,6 @@ word32LE w = fromBounded Nat.constant (Bounded.word32LE w)
word16LE :: Word16 -> Builder
word16LE w = fromBounded Nat.constant (Bounded.word16LE w)
-- | Requires exactly 32 bytes. Dump the octets of a 256-bit
-- word in a big-endian fashion.
word256BE :: Word256 -> Builder
@ -1043,3 +1056,119 @@ indexChar8Array (ByteArray b) (I# i) = C# (Exts.indexCharArray# b i)
c2w :: Char -> Word8
c2w = fromIntegral . ord
-- In C, this is: (n << 1) ^ (n >> (BIT_WIDTH - 1))
zigZagNative :: Int -> Word
zigZagNative s = fromIntegral @Int @Word
((unsafeShiftL s 1) `xor` (unsafeShiftR s (finiteBitSize (undefined :: Word) - 1)))
-- | Encode a signed machine-sized integer with LEB-128. This uses
-- zig-zag encoding.
intLEB128 :: Int -> Builder
intLEB128 = wordLEB128 . zigZagNative
-- | Encode a machine-sized word with LEB-128.
wordLEB128 :: Word -> Builder
wordLEB128 w = fromBounded Nat.constant (Bounded.wordLEB128 w)
-- | Encode a 64-bit word with LEB-128.
word64LEB128 :: Word64 -> Builder
word64LEB128 w = fromBounded Nat.constant (Bounded.word64LEB128 w)
-- | Encode a signed arbitrary-precision integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
integerDec :: Integer -> Builder
integerDec !i
| i < 0 = ascii '-' <> naturalDec (naturalFromInteger (negate i))
| otherwise = naturalDec (naturalFromInteger i)
-- | Encodes an unsigned arbitrary-precision integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
naturalDec :: Natural -> Builder
naturalDec !n0 = fromEffect
(I# (11# +# (3# *# integerLog2# (naturalToInteger n0))))
(\marr off -> case n0 of
0 -> do
PM.writeByteArray marr off (0x30 :: Word8)
pure (off + 1)
_ -> go n0 marr off off
)
where
go :: forall s. Natural -> MutableByteArray s -> Int -> Int -> ST s Int
go !n !buf !off0 !off = case quotRem n 1_000_000_000 of
(q,r) -> case q of
0 -> do
off' <- backwardsWordLoop buf off (fromIntegral @Natural @Word r)
reverseBytes buf off0 (off' - 1)
pure off'
_ -> do
off' <- backwardsPasteWordPaddedDec9
(fromIntegral @Natural @Word r) buf off
go q buf off0 off'
-- Reverse the bytes in the designated slice. This takes
-- an inclusive start offset and an inclusive end offset.
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
{-# inline reverseBytes #-}
reverseBytes arr begin end = go begin end where
go ixA ixB = if ixA < ixB
then do
a :: Word8 <- PM.readByteArray arr ixA
b :: Word8 <- PM.readByteArray arr ixB
PM.writeByteArray arr ixA b
PM.writeByteArray arr ixB a
go (ixA + 1) (ixB - 1)
else pure ()
backwardsPasteWordPaddedDec9 ::
Word -> MutableByteArray s -> Int -> ST s Int
backwardsPasteWordPaddedDec9 !w !arr !off = do
backwardsPutRem10
(backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
backwardsPutRem10 $ backwardsPutRem10 $ backwardsPutRem10 $
backwardsPutRem10 $ backwardsPutRem10
(\_ _ _ -> pure ())
) arr off w
pure (off + 9)
backwardsPutRem10 ::
(MutableByteArray s -> Int -> Word -> ST s a)
-> MutableByteArray s -> Int -> Word -> ST s a
{-# inline backwardsPutRem10 #-}
backwardsPutRem10 andThen arr off dividend = do
let quotient = approxDiv10 dividend
remainder = dividend - (10 * quotient)
PM.writeByteArray arr off (unsafeWordToWord8 (remainder + 48))
andThen arr (off + 1) quotient
backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
{-# inline backwardsWordLoop #-}
backwardsWordLoop arr off0 x0 = go off0 x0 where
go !off !(x :: Word) = if x > 0
then do
let (y,z) = quotRem x 10
PM.writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
go (off + 1) y
else pure off
-- Based on C code from https://stackoverflow.com/a/5558614
-- For numbers less than 1073741829, this gives a correct answer.
approxDiv10 :: Word -> Word
approxDiv10 !n = unsafeShiftR (0x1999999A * n) 32
-- -- A weird beast useful for rewrite rules. Not yet used. This will
-- -- ultimately replace fromEffect and fromBounded.
-- require :: Int -> Builder
-- require !n = Builder $ \buf0 off0 len0 cs0 s0 ->
-- let !(I# req) = n
-- in case len0 >=# req of
-- 1# -> (# s0, buf0, off0, len0, cs0 #)
-- _ -> let !(I# lenX) = max 4080 (I# req) in
-- case Exts.newByteArray# lenX s0 of
-- (# sX, bufX #) ->
-- (# sX, bufX, 0#, lenX, Mutable buf0 off0 cs0 #)
unsafeWordToWord8 :: Word -> Word8
unsafeWordToWord8 (W# w) = W8# w

View file

@ -45,6 +45,8 @@ module Data.Bytes.Builder.Bounded
-- ** 64-bit
, word64PaddedLowerHex
, word64PaddedUpperHex
-- ** 48-bit
, word48PaddedLowerHex
-- ** 32-bit
, word32PaddedLowerHex
, word32PaddedUpperHex
@ -89,6 +91,9 @@ module Data.Bytes.Builder.Bounded
, int64LE
, int32LE
, int16LE
-- **** LEB128
, wordLEB128
, word64LEB128
-- * Encode Floating-Point Types
, doubleDec
) where
@ -161,7 +166,10 @@ infixr 9 `append`
-- | Concatenate two builders.
append :: Builder m -> Builder n -> Builder (m + n)
append (Builder f) (Builder g) =
append = unsafeAppend
unsafeAppend :: Builder m -> Builder n -> Builder p
unsafeAppend (Builder f) (Builder g) =
Builder $ \arr off0 s0 -> case f arr off0 s0 of
(# s1, r #) -> g arr r s1
@ -286,15 +294,20 @@ wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0
internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
{-# inline internalWordLoop #-}
internalWordLoop arr off0 x0 = go off0 x0 where
internalWordLoop arr off0 x0 = do
off1 <- backwardsWordLoop arr off0 x0
reverseBytes arr off0 (off1 - 1)
pure off1
backwardsWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
{-# inline backwardsWordLoop #-}
backwardsWordLoop arr off0 x0 = go off0 x0 where
go !off !(x :: Word) = if x > 0
then do
let (y,z) = quotRem x 10
writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
go (off + 1) y
else do
reverseBytes arr off0 (off - 1)
pure off
else pure off
-- Requires up to 20 bytes. Can be less depending on what the
-- size of the argument is known to be. Unsafe.
@ -390,6 +403,14 @@ word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w
word64PaddedLowerHex :: Word64 -> Builder 16
word64PaddedLowerHex (W64# w) = word64PaddedLowerHex# w
-- | Requires exactly 12 bytes. Discards the upper 16 bits of a
-- 64-bit unsigned integer and then encodes the lower 48 bits as
-- hexadecimal, zero-padding the encoding to 12 digits. This uses
-- lowercase for the alphabetical digits. For example, this encodes the
-- number 1022 as @0000000003fe@.
word48PaddedLowerHex :: Word64 -> Builder 12
word48PaddedLowerHex (W64# w) = word48PaddedLowerHex# w
-- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 8 digits. This uses
-- uppercase for the alphabetical digits.
@ -481,6 +502,27 @@ word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
where
w = W# w#
-- TODO: Is it actually worth unrolling this loop. I suspect that it
-- might not be. Benchmark this.
word48PaddedLowerHex# :: Word# -> Builder 12
{-# noinline word48PaddedLowerHex# #-}
word48PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
writeByteArray arr off (toHexLower (unsafeShiftR w 44))
writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 40))
writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 36))
writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 32))
writeByteArray arr (off + 4) (toHexLower (unsafeShiftR w 28))
writeByteArray arr (off + 5) (toHexLower (unsafeShiftR w 24))
writeByteArray arr (off + 6) (toHexLower (unsafeShiftR w 20))
writeByteArray arr (off + 7) (toHexLower (unsafeShiftR w 16))
writeByteArray arr (off + 8) (toHexLower (unsafeShiftR w 12))
writeByteArray arr (off + 9) (toHexLower (unsafeShiftR w 8))
writeByteArray arr (off + 10) (toHexLower (unsafeShiftR w 4))
writeByteArray arr (off + 11) (toHexLower w)
pure (off + 12)
where
w = W# w#
-- TODO: Is it actually worth unrolling this loop. I suspect that it
-- might not be. Benchmark this.
word64PaddedLowerHex# :: Word# -> Builder 16
@ -752,6 +794,22 @@ ascii6 (C# c0) (C# c1) (C# c2) (C# c3) (C# c4) (C# c5) = Unsafe.construct $ \(Mu
primitive_ (writeCharArray# arr (off +# 5# ) c5)
pure (I# (off +# 6# ))
-- | Encode a machine-sized word with LEB-128.
wordLEB128 :: Word -> Builder 10
wordLEB128 (W# w) = lebCommon (W# w)
-- | Encode a 64-bit word with LEB-128.
word64LEB128 :: Word64 -> Builder 10
word64LEB128 (W64# w) = lebCommon (W# w)
lebCommon :: Word -> Builder n
lebCommon !w = case quotRem w 128 of
(q,r) -> case q of
0 -> unsafeWord8 (unsafeWordToWord8 r)
_ -> unsafeAppend
(unsafeWord8 (unsafeWordToWord8 (r .|. 0x80)))
(lebCommon q)
-- | Encode a character as UTF-8. This only uses as much space as is required.
char :: Char -> Builder 4
char c
@ -913,6 +971,11 @@ word8 w = Unsafe.construct $ \arr off -> do
writeByteArray arr off w
pure (off + 1)
unsafeWord8 :: Word8 -> Builder n
unsafeWord8 w = Unsafe.construct $ \arr off -> do
writeByteArray arr off w
pure (off + 1)
-- Reverse the bytes in the designated slice. This takes
-- an inclusive start offset and an inclusive end offset.
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()

View file

@ -15,13 +15,17 @@ import Data.Char (ord,chr)
import Data.IORef (IORef,newIORef,readIORef,writeIORef)
import Data.Primitive (ByteArray)
import Data.WideWord (Word128(Word128),Word256(Word256))
import Numeric.Natural (Natural)
import Test.Tasty (defaultMain,testGroup,TestTree)
import Test.QuickCheck ((===),Arbitrary)
import Test.QuickCheck.Instances.Natural ()
import Text.Printf (printf)
import Test.Tasty.HUnit ((@=?))
import qualified Arithmetic.Nat as Nat
import qualified Data.Bits as Bits
import qualified Data.Bytes.Builder.Bounded as Bounded
import qualified Data.Bytes as Bytes
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Lazy.Char8 as LB
@ -215,6 +219,19 @@ tests = testGroup "Tests"
in runConcat 1 (foldMap word256BE xs)
===
runConcat 1 (word256ArrayBE ys 0 (Prelude.length xs))
, TQC.testProperty "word64LEB128" $ \(x :: Word64) ->
runConcat 1 (word64LEB128 x)
===
naiveLeb128 (fromIntegral x)
, TQC.testProperty "naturalDec-A" $ \(x :: Natural) ->
runConcat 1 (naturalDec x)
===
pack (show x)
, TQC.testProperty "naturalDec-B" $ \(x :: Natural) ->
let y = 1234567892345678934678987654321 * x in
runConcat 1 (naturalDec y)
===
pack (show y)
]
, testGroup "alternate"
[ TQC.testProperty "HexWord64" $ \x y ->
@ -306,3 +323,18 @@ zeroPadL :: Int -> String -> String
zeroPadL n s
| length s < n = replicate (n - length s) '0' ++ s
| otherwise = s
naiveLeb128 :: Natural -> ByteArray
naiveLeb128 x =
Bytes.toByteArray (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : go [] x)))
where
go !xs !n =
let (q,r) = quotRem n 128
r' = fromIntegral @Natural @Word8 r
w = if q == 0
then r'
else Bits.setBit r' 7
xs' = w : xs
in if q == 0
then L.reverse xs'
else go xs' q