Seven eights encoding
"7/8" encoding in two variants. Smoke tests for 7/8 encoding. Co-authored-by: Eric Demko <edemko@layer3com.com>
This commit is contained in:
parent
f5709a8cd2
commit
f16f2120e3
3 changed files with 81 additions and 1 deletions
6
cabal.project
Normal file
6
cabal.project
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
packages: .
|
||||||
|
|
||||||
|
source-repository-package
|
||||||
|
type: git
|
||||||
|
location: https://github.com/byteverse/zigzag
|
||||||
|
tag: 689fc7c852bf029af51333bcfffe3661c3276cf1
|
|
@ -32,6 +32,9 @@ module Data.Bytes.Builder
|
||||||
, cstring#
|
, cstring#
|
||||||
, cstringLen
|
, cstringLen
|
||||||
, stringUtf8
|
, stringUtf8
|
||||||
|
-- * Byte Sequence Encodings
|
||||||
|
, sevenEightRight
|
||||||
|
, sevenEightSmile
|
||||||
-- * Encode Integral Types
|
-- * Encode Integral Types
|
||||||
-- ** Human-Readable
|
-- ** Human-Readable
|
||||||
, word64Dec
|
, word64Dec
|
||||||
|
@ -135,7 +138,7 @@ import Prelude hiding (replicate)
|
||||||
import Control.Exception (SomeException,toException)
|
import Control.Exception (SomeException,toException)
|
||||||
import Control.Monad.IO.Class (MonadIO,liftIO)
|
import Control.Monad.IO.Class (MonadIO,liftIO)
|
||||||
import Control.Monad.ST (ST,runST)
|
import Control.Monad.ST (ST,runST)
|
||||||
import Data.Bits (unsafeShiftR)
|
import Data.Bits ((.&.),(.|.),unsafeShiftL,unsafeShiftR)
|
||||||
import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
|
import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
|
||||||
import Data.Bytes.Builder.Unsafe (Builder(Builder),commitDistance1)
|
import Data.Bytes.Builder.Unsafe (Builder(Builder),commitDistance1)
|
||||||
import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
|
import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
|
||||||
|
@ -168,6 +171,7 @@ import Numeric.Natural (Natural)
|
||||||
|
|
||||||
import qualified Arithmetic.Nat as Nat
|
import qualified Arithmetic.Nat as Nat
|
||||||
import qualified Arithmetic.Types as Arithmetic
|
import qualified Arithmetic.Types as Arithmetic
|
||||||
|
import qualified Data.Bytes as Bytes
|
||||||
import qualified Data.Bytes.Builder.Bounded as Bounded
|
import qualified Data.Bytes.Builder.Bounded as Bounded
|
||||||
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
|
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
|
||||||
import qualified Data.Primitive as PM
|
import qualified Data.Primitive as PM
|
||||||
|
@ -414,6 +418,66 @@ cstringLen (Exts.Ptr src#, I# slen# ) = Builder
|
||||||
where
|
where
|
||||||
!(I# newSz) = max (I# slen#) 4080
|
!(I# newSz) = max (I# slen#) 4080
|
||||||
|
|
||||||
|
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
|
||||||
|
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
|
||||||
|
-- index byte, most-to-least significant bit within a byte), pads the last group
|
||||||
|
-- with trailing zeros, and forms octects by prepending a zero to each group.
|
||||||
|
--
|
||||||
|
-- The name was chosen because this pads the input bits with zeros on the right,
|
||||||
|
-- and also because this was likely the originally-indended behavior of the
|
||||||
|
-- SMILE standard (see 'sevenEightSmile'). Right padding the input bits to a
|
||||||
|
-- multiple of seven, as in this variant, is consistent with base64 encodings
|
||||||
|
-- (which encodes 3 bytes in 4) and base85 (which encodes 4 bytes in 5).
|
||||||
|
sevenEightRight :: Bytes -> Builder
|
||||||
|
sevenEightRight bs0 = case toWord 0 0 bs0 of
|
||||||
|
(0, _) -> mempty
|
||||||
|
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
|
||||||
|
where
|
||||||
|
go :: Int -> Word64 -> Builder
|
||||||
|
go !nBits !_ | nBits <= 0 = mempty
|
||||||
|
go !nBits !w =
|
||||||
|
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
|
||||||
|
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
|
||||||
|
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
|
||||||
|
toWord !i !acc !bs
|
||||||
|
| Bytes.length bs == 0 = (i, acc)
|
||||||
|
| otherwise =
|
||||||
|
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
|
||||||
|
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
|
||||||
|
in if i < 7
|
||||||
|
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
|
||||||
|
else (i, acc)
|
||||||
|
|
||||||
|
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
|
||||||
|
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
|
||||||
|
-- index byte, most-to-least significant bit within a byte), then pad each group
|
||||||
|
-- with zeros on the left until each group is an octet.
|
||||||
|
--
|
||||||
|
-- The name was chosen because this is the implementation that is used (probably
|
||||||
|
-- unintentionally) in the reference SMILE implementation, and so is expected tp
|
||||||
|
-- be accepted by existing SMILE consumers.
|
||||||
|
sevenEightSmile :: Bytes -> Builder
|
||||||
|
sevenEightSmile bs0 = case toWord 0 0 bs0 of
|
||||||
|
(0, _) -> mempty
|
||||||
|
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
|
||||||
|
where
|
||||||
|
go :: Int -> Word64 -> Builder
|
||||||
|
go !nBits !w
|
||||||
|
| nBits == 0 = mempty
|
||||||
|
| nBits < 7 = go 7 (unsafeShiftR w (7 - nBits))
|
||||||
|
go !nBits !w =
|
||||||
|
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
|
||||||
|
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
|
||||||
|
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
|
||||||
|
toWord !i !acc !bs
|
||||||
|
| Bytes.length bs == 0 = (i, acc)
|
||||||
|
| otherwise =
|
||||||
|
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
|
||||||
|
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
|
||||||
|
in if i < 7
|
||||||
|
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
|
||||||
|
else (i, acc)
|
||||||
|
|
||||||
-- | Create a builder from two byte sequences. This always results in two
|
-- | Create a builder from two byte sequences. This always results in two
|
||||||
-- calls to @memcpy@. This is beneficial when the byte sequences are
|
-- calls to @memcpy@. This is beneficial when the byte sequences are
|
||||||
-- known to be small (less than 256 bytes).
|
-- known to be small (less than 256 bytes).
|
||||||
|
|
10
test/Main.hs
10
test/Main.hs
|
@ -255,6 +255,16 @@ tests = testGroup "Tests"
|
||||||
runConcat 1 (naturalDec y)
|
runConcat 1 (naturalDec y)
|
||||||
===
|
===
|
||||||
pack (show y)
|
pack (show y)
|
||||||
|
, testGroup "seven/eight encoding"
|
||||||
|
[ THU.testCase "deadbeef" $ do
|
||||||
|
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
|
||||||
|
(Chunks.concat . run 16) (sevenEightRight inp)
|
||||||
|
@=? Latin1.fromString "\x6F\x2B\x37\x6E\x78"
|
||||||
|
, THU.testCase "deadbeef-smile" $ do
|
||||||
|
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
|
||||||
|
(Chunks.concat . run 16) (sevenEightSmile inp)
|
||||||
|
@=?Latin1.fromString "\x6F\x2B\x37\x6E\x0F"
|
||||||
|
]
|
||||||
]
|
]
|
||||||
, testGroup "alternate"
|
, testGroup "alternate"
|
||||||
[ TQC.testProperty "HexWord64" $ \x y ->
|
[ TQC.testProperty "HexWord64" $ \x y ->
|
||||||
|
|
Loading…
Reference in a new issue