Seven eights encoding

"7/8" encoding in two variants. Smoke tests for 7/8 encoding.

Co-authored-by: Eric Demko <edemko@layer3com.com>
This commit is contained in:
Zankoku Okuno 2022-01-31 09:41:05 -05:00 committed by GitHub
parent f5709a8cd2
commit f16f2120e3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 81 additions and 1 deletions

6
cabal.project Normal file
View file

@ -0,0 +1,6 @@
packages: .
source-repository-package
type: git
location: https://github.com/byteverse/zigzag
tag: 689fc7c852bf029af51333bcfffe3661c3276cf1

View file

@ -32,6 +32,9 @@ module Data.Bytes.Builder
, cstring#
, cstringLen
, stringUtf8
-- * Byte Sequence Encodings
, sevenEightRight
, sevenEightSmile
-- * Encode Integral Types
-- ** Human-Readable
, word64Dec
@ -135,7 +138,7 @@ import Prelude hiding (replicate)
import Control.Exception (SomeException,toException)
import Control.Monad.IO.Class (MonadIO,liftIO)
import Control.Monad.ST (ST,runST)
import Data.Bits (unsafeShiftR)
import Data.Bits ((.&.),(.|.),unsafeShiftL,unsafeShiftR)
import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
import Data.Bytes.Builder.Unsafe (Builder(Builder),commitDistance1)
import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
@ -168,6 +171,7 @@ import Numeric.Natural (Natural)
import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Types as Arithmetic
import qualified Data.Bytes as Bytes
import qualified Data.Bytes.Builder.Bounded as Bounded
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
import qualified Data.Primitive as PM
@ -414,6 +418,66 @@ cstringLen (Exts.Ptr src#, I# slen# ) = Builder
where
!(I# newSz) = max (I# slen#) 4080
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
-- index byte, most-to-least significant bit within a byte), pads the last group
-- with trailing zeros, and forms octects by prepending a zero to each group.
--
-- The name was chosen because this pads the input bits with zeros on the right,
-- and also because this was likely the originally-indended behavior of the
-- SMILE standard (see 'sevenEightSmile'). Right padding the input bits to a
-- multiple of seven, as in this variant, is consistent with base64 encodings
-- (which encodes 3 bytes in 4) and base85 (which encodes 4 bytes in 5).
sevenEightRight :: Bytes -> Builder
sevenEightRight bs0 = case toWord 0 0 bs0 of
(0, _) -> mempty
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
where
go :: Int -> Word64 -> Builder
go !nBits !_ | nBits <= 0 = mempty
go !nBits !w =
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
toWord !i !acc !bs
| Bytes.length bs == 0 = (i, acc)
| otherwise =
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
in if i < 7
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
else (i, acc)
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
-- index byte, most-to-least significant bit within a byte), then pad each group
-- with zeros on the left until each group is an octet.
--
-- The name was chosen because this is the implementation that is used (probably
-- unintentionally) in the reference SMILE implementation, and so is expected tp
-- be accepted by existing SMILE consumers.
sevenEightSmile :: Bytes -> Builder
sevenEightSmile bs0 = case toWord 0 0 bs0 of
(0, _) -> mempty
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
where
go :: Int -> Word64 -> Builder
go !nBits !w
| nBits == 0 = mempty
| nBits < 7 = go 7 (unsafeShiftR w (7 - nBits))
go !nBits !w =
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
toWord !i !acc !bs
| Bytes.length bs == 0 = (i, acc)
| otherwise =
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
in if i < 7
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
else (i, acc)
-- | Create a builder from two byte sequences. This always results in two
-- calls to @memcpy@. This is beneficial when the byte sequences are
-- known to be small (less than 256 bytes).

View file

@ -255,6 +255,16 @@ tests = testGroup "Tests"
runConcat 1 (naturalDec y)
===
pack (show y)
, testGroup "seven/eight encoding"
[ THU.testCase "deadbeef" $ do
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
(Chunks.concat . run 16) (sevenEightRight inp)
@=? Latin1.fromString "\x6F\x2B\x37\x6E\x78"
, THU.testCase "deadbeef-smile" $ do
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
(Chunks.concat . run 16) (sevenEightSmile inp)
@=?Latin1.fromString "\x6F\x2B\x37\x6E\x0F"
]
]
, testGroup "alternate"
[ TQC.testProperty "HexWord64" $ \x y ->