Seven eights encoding
"7/8" encoding in two variants. Smoke tests for 7/8 encoding. Co-authored-by: Eric Demko <edemko@layer3com.com>
This commit is contained in:
parent
f5709a8cd2
commit
f16f2120e3
3 changed files with 81 additions and 1 deletions
6
cabal.project
Normal file
6
cabal.project
Normal file
|
@ -0,0 +1,6 @@
|
|||
packages: .
|
||||
|
||||
source-repository-package
|
||||
type: git
|
||||
location: https://github.com/byteverse/zigzag
|
||||
tag: 689fc7c852bf029af51333bcfffe3661c3276cf1
|
|
@ -32,6 +32,9 @@ module Data.Bytes.Builder
|
|||
, cstring#
|
||||
, cstringLen
|
||||
, stringUtf8
|
||||
-- * Byte Sequence Encodings
|
||||
, sevenEightRight
|
||||
, sevenEightSmile
|
||||
-- * Encode Integral Types
|
||||
-- ** Human-Readable
|
||||
, word64Dec
|
||||
|
@ -135,7 +138,7 @@ import Prelude hiding (replicate)
|
|||
import Control.Exception (SomeException,toException)
|
||||
import Control.Monad.IO.Class (MonadIO,liftIO)
|
||||
import Control.Monad.ST (ST,runST)
|
||||
import Data.Bits (unsafeShiftR)
|
||||
import Data.Bits ((.&.),(.|.),unsafeShiftL,unsafeShiftR)
|
||||
import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
|
||||
import Data.Bytes.Builder.Unsafe (Builder(Builder),commitDistance1)
|
||||
import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
|
||||
|
@ -168,6 +171,7 @@ import Numeric.Natural (Natural)
|
|||
|
||||
import qualified Arithmetic.Nat as Nat
|
||||
import qualified Arithmetic.Types as Arithmetic
|
||||
import qualified Data.Bytes as Bytes
|
||||
import qualified Data.Bytes.Builder.Bounded as Bounded
|
||||
import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
|
||||
import qualified Data.Primitive as PM
|
||||
|
@ -414,6 +418,66 @@ cstringLen (Exts.Ptr src#, I# slen# ) = Builder
|
|||
where
|
||||
!(I# newSz) = max (I# slen#) 4080
|
||||
|
||||
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
|
||||
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
|
||||
-- index byte, most-to-least significant bit within a byte), pads the last group
|
||||
-- with trailing zeros, and forms octects by prepending a zero to each group.
|
||||
--
|
||||
-- The name was chosen because this pads the input bits with zeros on the right,
|
||||
-- and also because this was likely the originally-indended behavior of the
|
||||
-- SMILE standard (see 'sevenEightSmile'). Right padding the input bits to a
|
||||
-- multiple of seven, as in this variant, is consistent with base64 encodings
|
||||
-- (which encodes 3 bytes in 4) and base85 (which encodes 4 bytes in 5).
|
||||
sevenEightRight :: Bytes -> Builder
|
||||
sevenEightRight bs0 = case toWord 0 0 bs0 of
|
||||
(0, _) -> mempty
|
||||
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
|
||||
where
|
||||
go :: Int -> Word64 -> Builder
|
||||
go !nBits !_ | nBits <= 0 = mempty
|
||||
go !nBits !w =
|
||||
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
|
||||
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
|
||||
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
|
||||
toWord !i !acc !bs
|
||||
| Bytes.length bs == 0 = (i, acc)
|
||||
| otherwise =
|
||||
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
|
||||
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
|
||||
in if i < 7
|
||||
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
|
||||
else (i, acc)
|
||||
|
||||
-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
|
||||
-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
|
||||
-- index byte, most-to-least significant bit within a byte), then pad each group
|
||||
-- with zeros on the left until each group is an octet.
|
||||
--
|
||||
-- The name was chosen because this is the implementation that is used (probably
|
||||
-- unintentionally) in the reference SMILE implementation, and so is expected tp
|
||||
-- be accepted by existing SMILE consumers.
|
||||
sevenEightSmile :: Bytes -> Builder
|
||||
sevenEightSmile bs0 = case toWord 0 0 bs0 of
|
||||
(0, _) -> mempty
|
||||
(len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
|
||||
where
|
||||
go :: Int -> Word64 -> Builder
|
||||
go !nBits !w
|
||||
| nBits == 0 = mempty
|
||||
| nBits < 7 = go 7 (unsafeShiftR w (7 - nBits))
|
||||
go !nBits !w =
|
||||
let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
|
||||
in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
|
||||
toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
|
||||
toWord !i !acc !bs
|
||||
| Bytes.length bs == 0 = (i, acc)
|
||||
| otherwise =
|
||||
let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
|
||||
acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
|
||||
in if i < 7
|
||||
then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
|
||||
else (i, acc)
|
||||
|
||||
-- | Create a builder from two byte sequences. This always results in two
|
||||
-- calls to @memcpy@. This is beneficial when the byte sequences are
|
||||
-- known to be small (less than 256 bytes).
|
||||
|
|
10
test/Main.hs
10
test/Main.hs
|
@ -255,6 +255,16 @@ tests = testGroup "Tests"
|
|||
runConcat 1 (naturalDec y)
|
||||
===
|
||||
pack (show y)
|
||||
, testGroup "seven/eight encoding"
|
||||
[ THU.testCase "deadbeef" $ do
|
||||
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
|
||||
(Chunks.concat . run 16) (sevenEightRight inp)
|
||||
@=? Latin1.fromString "\x6F\x2B\x37\x6E\x78"
|
||||
, THU.testCase "deadbeef-smile" $ do
|
||||
let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
|
||||
(Chunks.concat . run 16) (sevenEightSmile inp)
|
||||
@=?Latin1.fromString "\x6F\x2B\x37\x6E\x0F"
|
||||
]
|
||||
]
|
||||
, testGroup "alternate"
|
||||
[ TQC.testProperty "HexWord64" $ \x y ->
|
||||
|
|
Loading…
Reference in a new issue