Seven eights encoding

"7/8" encoding in two variants. Smoke tests for 7/8 encoding. Co-authored-by: Eric Demko <edemko@layer3com.com>
2022-01-31 09:41:05 -05:00 · 2022-01-31 09:41:05 -05:00 · f16f2120e3
commit f16f2120e3
parent f5709a8cd2
3 changed files with 81 additions and 1 deletions
--- a/cabal.project
+++ b/cabal.project
@ -0,0 +1,6 @@
+packages: .
+
+source-repository-package
+    type: git
+    location: https://github.com/byteverse/zigzag
+    tag: 689fc7c852bf029af51333bcfffe3661c3276cf1
--- a/src/Data/Bytes/Builder.hs
+++ b/src/Data/Bytes/Builder.hs
@ -32,6 +32,9 @@ module Data.Bytes.Builder
  , cstring#
  , cstringLen
  , stringUtf8
+    -- * Byte Sequence Encodings
+  , sevenEightRight
+  , sevenEightSmile
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
@ -135,7 +138,7 @@ import Prelude hiding (replicate)
 import Control.Exception (SomeException,toException)
 import Control.Monad.IO.Class (MonadIO,liftIO)
 import Control.Monad.ST (ST,runST)
-import Data.Bits (unsafeShiftR)
+import Data.Bits ((.&.),(.|.),unsafeShiftL,unsafeShiftR)
 import Data.Bytes.Builder.Unsafe (addCommitsLength,copyReverseCommits)
 import Data.Bytes.Builder.Unsafe (Builder(Builder),commitDistance1)
 import Data.Bytes.Builder.Unsafe (BuilderState(BuilderState),pasteIO)
@ -168,6 +171,7 @@ import Numeric.Natural (Natural)

 import qualified Arithmetic.Nat as Nat
 import qualified Arithmetic.Types as Arithmetic
+import qualified Data.Bytes as Bytes
 import qualified Data.Bytes.Builder.Bounded as Bounded
 import qualified Data.Bytes.Builder.Bounded.Unsafe as UnsafeBounded
 import qualified Data.Primitive as PM
@ -414,6 +418,66 @@ cstringLen (Exts.Ptr src#, I# slen# ) = Builder
  where
  !(I# newSz) = max (I# slen#) 4080

+-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
+-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
+-- index byte, most-to-least significant bit within a byte), pads the last group
+-- with trailing zeros, and forms octects by prepending a zero to each group.
+--
+-- The name was chosen because this pads the input bits with zeros on the right,
+-- and also because this was likely the originally-indended behavior of the
+-- SMILE standard (see 'sevenEightSmile'). Right padding the input bits to a
+-- multiple of seven, as in this variant, is consistent with base64 encodings
+-- (which encodes 3 bytes in 4) and base85 (which encodes 4 bytes in 5).
+sevenEightRight :: Bytes -> Builder
+sevenEightRight bs0 = case toWord 0 0 bs0 of
+  (0, _) -> mempty
+  (len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
+  where
+  go :: Int -> Word64 -> Builder
+  go !nBits !_ | nBits <= 0 = mempty
+  go !nBits !w =
+    let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
+     in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
+  toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
+  toWord !i !acc !bs
+    | Bytes.length bs == 0 = (i, acc)
+    | otherwise =
+        let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
+            acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
+         in if i < 7
+            then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
+            else (i, acc)
+
+-- | Encode seven bytes into eight so that the encoded form is eight-bit clean.
+-- Specifically segment the input bytes inot 7-bit groups (lowest-to-highest
+-- index byte, most-to-least significant bit within a byte), then pad each group
+-- with zeros on the left until each group is an octet.
+--
+-- The name was chosen because this is the implementation that is used (probably
+-- unintentionally) in the reference SMILE implementation, and so is expected tp
+-- be accepted by existing SMILE consumers.
+sevenEightSmile :: Bytes -> Builder
+sevenEightSmile bs0 = case toWord 0 0 bs0 of
+  (0, _) -> mempty
+  (len, w) -> go (len * 8) w <> sevenEightSmile (Bytes.unsafeDrop len bs0)
+  where
+  go :: Int -> Word64 -> Builder
+  go !nBits !w
+    | nBits == 0 = mempty
+    | nBits < 7 = go 7 (unsafeShiftR w (7 - nBits))
+  go !nBits !w =
+    let octet = (fromIntegral $ unsafeShiftR w (8*7+1)) .&. 0x7f
+     in word8 octet <> go (nBits - 7) (unsafeShiftL w 7)
+  toWord :: Int -> Word64 -> Bytes -> (Int, Word64)
+  toWord !i !acc !bs
+    | Bytes.length bs == 0 = (i, acc)
+    | otherwise =
+        let b = fromIntegral @Word8 @Word64 $ Bytes.unsafeIndex bs 0
+            acc' = acc .|. unsafeShiftL b (fromIntegral $ 8 * (7 - i))
+         in if i < 7
+            then toWord (i + 1) acc' (Bytes.unsafeDrop 1 bs)
+            else (i, acc)
+
 -- | Create a builder from two byte sequences. This always results in two
 -- calls to @memcpy@. This is beneficial when the byte sequences are
 -- known to be small (less than 256 bytes).
--- a/test/Main.hs
+++ b/test/Main.hs
@ -255,6 +255,16 @@ tests = testGroup "Tests"
        runConcat 1 (naturalDec y)
        ===
        pack (show y)
+    , testGroup "seven/eight encoding"
+      [ THU.testCase "deadbeef" $ do
+        let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
+        (Chunks.concat . run 16) (sevenEightRight inp)
+          @=? Latin1.fromString "\x6F\x2B\x37\x6E\x78"
+      , THU.testCase "deadbeef-smile" $ do
+        let inp = Latin1.fromString "\xDE\xAD\xBE\xEF"
+        (Chunks.concat . run 16) (sevenEightSmile inp)
+          @=?Latin1.fromString "\x6F\x2B\x37\x6E\x0F"
+      ]
    ]
  , testGroup "alternate"
    [ TQC.testProperty "HexWord64" $ \x y ->