548 lines
16 KiB
Text
548 lines
16 KiB
Text
---
|
|
title: applying patches
|
|
...
|
|
|
|
stuff like rom hacks are usually distributed in one of two formats:
|
|
[IPS][] or [BPS][].
|
|
|
|
[IPS]: http://fileformats.archiveteam.org/wiki/IPS_(binary_patch_format)
|
|
[BPS]: https://github.com/blakesmith/rombp/blob/master/docs/bps_spec.md
|
|
|
|
> import Control.Applicative
|
|
> import Control.Monad
|
|
> import Control.Monad.Except
|
|
> import Control.Monad.State
|
|
> import Data.Bits
|
|
> import Data.ByteString (ByteString)
|
|
> import Data.ByteString qualified as BS
|
|
> import Data.Word
|
|
> import Numeric.Natural
|
|
|
|
and for tests:
|
|
|
|
<!--
|
|
#ifdef TESTS
|
|
-->
|
|
|
|
> import Numeric
|
|
> import Data.Either
|
|
> import Test.QuickCheck
|
|
> import GHC.Generics
|
|
|
|
<!--
|
|
#endif
|
|
-->
|
|
|
|
|
|
== basic parsers
|
|
|
|
i could just use attoparsec or something, and in the original code i did, but
|
|
that's not really needed for a program like this, so let's make our own. it's
|
|
good enough for these purposes. the error messages are bad, but what are you
|
|
going to do, hand-edit an IPS file???
|
|
|
|
a parser has read-write access to the input, a `ByteString`; and the ability to
|
|
throw errors (string messages are fine for this).
|
|
|
|
> newtype Parser a = Parser (StateT ByteString (Except String) a)
|
|
> deriving (Functor, Applicative, Alternative, Monad, MonadPlus)
|
|
|
|
when running a parser we want to check that if it was successful, then it did
|
|
actually consume all of the input. but not doing so, and returning the remainder
|
|
instead, will be useful for tests.
|
|
|
|
> runParserPartial :: Parser a -> ByteString -> Either String (a, ByteString)
|
|
> runParserPartial (Parser p) str = runExcept (runStateT p str)
|
|
>
|
|
> runParser :: Parser a -> ByteString -> Either String a
|
|
> runParser p str = do
|
|
> (result, rest) <- runParserPartial p str
|
|
> if BS.null rest then
|
|
> Right result
|
|
> else
|
|
> Left ("junk after EOF: " ++ show rest)
|
|
|
|
<details>
|
|
<summary>testing parser functions</summary>
|
|
<!--
|
|
#ifdef TESTS
|
|
-->
|
|
|
|
to test these parsers we can generate random inputs and check that they do the
|
|
right thing. a test input is a byte string, but printed with each byte in hex.
|
|
|
|
> newtype ParserInput = ParserInput { fromParserInput :: ByteString }
|
|
>
|
|
> instance Show ParserInput where
|
|
> show = unwords . map showHex2 . piToList
|
|
>
|
|
> showHex2 :: Word8 -> String
|
|
> showHex2 n = case showHex n "" of [c] -> ['0', c]; str -> str
|
|
>
|
|
> piToList :: ParserInput -> [Word8]
|
|
> piToList = BS.unpack . fromParserInput
|
|
>
|
|
> listToPI :: [Word8] -> ParserInput
|
|
> listToPI = ParserInput . BS.pack
|
|
|
|
like so:
|
|
|
|
```
|
|
ghci> ParserInput "\0\0\0"
|
|
00 00 00
|
|
ghci> ParserInput "Hello"
|
|
48 65 6c 6c 6f
|
|
```
|
|
|
|
in many cases, we don't really care about the structure of the input, so it can
|
|
just be a sequence of any bytes. quickcheck uses the `Arbitrary` class for
|
|
generating test input. the optional `shrink` function produces smaller but
|
|
related values, for trying to find a minimal test case. the instances for
|
|
`CoArbitrary` and `Function` is for generating functions which take
|
|
`ParserInput` as an argument.
|
|
|
|
> instance Arbitrary ParserInput where
|
|
> arbitrary :: Gen ParserInput
|
|
> arbitrary = listToPI <$> arbitrary
|
|
>
|
|
> shrink :: ParserInput -> [ParserInput]
|
|
> shrink = map listToPI . shrinkList (const []) . piToList
|
|
>
|
|
> instance CoArbitrary ParserInput where
|
|
> coarbitrary = coarbitrary . piToList
|
|
>
|
|
> instance Function ParserInput where
|
|
> function = functionMap piToList listToPI
|
|
|
|
in this case, `shrink` tries to remove some bytes from the string, without
|
|
touching individual bytes.
|
|
|
|
<!--
|
|
#endif
|
|
-->
|
|
</details>
|
|
|
|
when consuming a given number of characters, we check here that there is
|
|
actually enough input left.
|
|
|
|
> bytes :: Int -> Parser ByteString
|
|
> bytes len = Parser do
|
|
> (this, rest) <- BS.splitAt len <$> get
|
|
> if BS.length this == len then do
|
|
> put rest
|
|
> pure this
|
|
> else
|
|
> throwError "tried to read past end of input"
|
|
|
|
<details>
|
|
<summary>testing `bytes`</summary>
|
|
<!--
|
|
#ifdef TESTS
|
|
-->
|
|
|
|
for these tests, we need a generator for a string together with a valid
|
|
prefix length. if we tell quickcheck to just guess random numbers with no help,
|
|
it'll give up before running enough tests.
|
|
|
|
> data InputWithPrefix = ParserInput `WithPrefix` Int
|
|
> deriving (Show, Generic)
|
|
>
|
|
> instance Arbitrary InputWithPrefix where
|
|
> arbitrary = do
|
|
> input <- arbitrary
|
|
> len <- chooseInt (0, BS.length (fromParserInput input))
|
|
> pure (input `WithPrefix` len)
|
|
> shrink = genericShrink
|
|
|
|
|
|
now, the properties we want are:
|
|
|
|
if the remaining input is too short, then `bytes` should fail.
|
|
|
|
> prop_bytes_short (ParserInput str) (NonNegative count) =
|
|
> BS.length str < count ==> isLeft (runParserPartial (bytes count) str)
|
|
|
|
if there _is_ enough input, it should succeed.
|
|
|
|
> prop_bytes_enough (ParserInput str `WithPrefix` count) =
|
|
> isRight (runParserPartial (bytes count) str)
|
|
|
|
the returned string (the first element of the returned pair) should have the
|
|
right length.
|
|
|
|
> prop_bytes_length (ParserInput str `WithPrefix` count) =
|
|
> withSuccess (bytes count) str \(result, _) -> BS.length result === count
|
|
|
|
appending the returned string and the remainder should give the original input.
|
|
|
|
> prop_bytes_append (ParserInput str `WithPrefix` count) =
|
|
> withSuccess (bytes count) str \(result, rest) -> result <> rest === str
|
|
|
|
<!--
|
|
#endif
|
|
-->
|
|
</details>
|
|
|
|
it is also useful to take input while a given condition is true. in this case,
|
|
the parsing always succeeds but might be empty.
|
|
|
|
> bytesWhile :: (Word8 -> Bool) -> Parser ByteString
|
|
> bytesWhile cond = Parser do
|
|
> (this, rest) <- BS.span cond <$> get
|
|
> put rest; pure this
|
|
|
|
<details>
|
|
<summary>testing `bytesWhile`</summary>
|
|
<!--
|
|
#ifdef TESTS
|
|
-->
|
|
|
|
> withSuccess :: Testable p =>
|
|
> Parser a -> ByteString -> ((a, ByteString) -> p) -> Property
|
|
> withSuccess p str f =
|
|
> case runParserPartial p str of
|
|
> Left _ -> property False
|
|
> Right x -> property (f x)
|
|
|
|
`bytesWhile` always succeeds (with a possibly-empty result).
|
|
|
|
> prop_bytesWhile_succeed (Fn p) (ParserInput str) =
|
|
> isRight (runParserPartial (bytesWhile p) str)
|
|
|
|
`bytesWhile p` returns a value where `p` holds for each byte. this test also
|
|
categorises cases by result length, and requires the remainder to be non-empty,
|
|
to check that we're not accidentally creating only constant functions.
|
|
|
|
> prop_bytesWhile_result (Fn p) (ParserInput str) = do
|
|
> withSuccess (bytesWhile p) str \(result, rest) ->
|
|
> collect (BS.length result)
|
|
> (not (BS.null rest) ==> BS.all p result)
|
|
|
|
if `bytesWhile` doesn't consume the whole input, then `p` _doesn't_ hold for the
|
|
first byte of the remainder.
|
|
|
|
> prop_bytesWhile_rest (Fn p) (ParserInput str) =
|
|
> withSuccess (bytesWhile p) str \(_, rest) ->
|
|
> not (BS.null rest) ==> not (p (BS.head rest))
|
|
|
|
<!--
|
|
#endif
|
|
-->
|
|
</details>
|
|
|
|
|
|
checking for an exact string can be written in terms of `bytes`.
|
|
|
|
> exact :: ByteString -> Parser ()
|
|
> exact str = do
|
|
> prefix <- bytes (BS.length str)
|
|
> unless (prefix == str) do
|
|
> Parser (throwError ("expected " ++ show str ++ ", got " ++ show prefix))
|
|
|
|
to detect the end of input, it's simplest to just look at the remaining input
|
|
directly.
|
|
|
|
> eof :: Parser ()
|
|
> eof = Parser do
|
|
> input <- get
|
|
> unless (BS.null input) do
|
|
> throwError ("expected end of input, got " ++ show input)
|
|
|
|
reading an integer just takes the appropriate number of bytes and shifts them
|
|
together. **multi-byte integers in the IPS format are big-endian.**
|
|
|
|
> word8 :: Parser Word8
|
|
> word8 = BS.head <$> bytes 1
|
|
>
|
|
> word16BE :: Parser Word16
|
|
> word16BE = do
|
|
> hi <- word8; lo <- word8
|
|
> pure $ fromIntegral hi `shiftL` 8 .|. fromIntegral lo
|
|
>
|
|
> word24BE :: Parser Word32
|
|
> word24BE = do
|
|
> hi <- word16BE; lo <- word8
|
|
> pure $ fromIntegral hi `shiftL` 8 .|. fromIntegral lo
|
|
|
|
|
|
== IPS file format
|
|
|
|
the IPS format is extremely simple. it consists of the ASCII string `PATCH`, a
|
|
number of [chunks](#chunks), and finally the string `EOF`.
|
|
|
|
```{=html}
|
|
<figure class=shadowed>
|
|
<svg viewBox="-1 -1 602 32" width=602 height=32>
|
|
<style>
|
|
rect { stroke-width: 1px; stroke: currentcolor; }
|
|
text { text-anchor: middle; }
|
|
</style>
|
|
<g>
|
|
<rect width=15 height=30 fill="oklch(95% 15% 220deg)" />
|
|
<text x=7 y=22> P </text>
|
|
</g>
|
|
<g transform="translate(15 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 220deg)" />
|
|
<text x=7 y=22> A </text>
|
|
</g>
|
|
<g transform="translate(30 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 220deg)" />
|
|
<text x=7 y=22> T </text>
|
|
</g>
|
|
<g transform="translate(45 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 220deg)" />
|
|
<text x=7 y=22> C </text>
|
|
</g>
|
|
<g transform="translate(60 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 220deg)" />
|
|
<text x=7 y=22> H </text>
|
|
</g>
|
|
<g transform="translate(75 0)">
|
|
<rect width=80 height=30 fill="oklch(95% 15% 180deg)" />
|
|
<text x=40 y=22> Chunk1 </text>
|
|
</g>
|
|
<g transform="translate(155 0)">
|
|
<rect width=180 height=30 fill="oklch(95% 15% 160deg)" />
|
|
<text x=90 y=22> Chunk2 </text>
|
|
</g>
|
|
<g transform="translate(335 0)">
|
|
<rect width=120 height=30 fill="oklch(95% 15% 140deg)" />
|
|
<text x=60 y=22> Chunk3 </text>
|
|
</g>
|
|
<g transform="translate(455 0)">
|
|
<rect width=100 height=30 fill="oklch(95% 15% 120deg)" />
|
|
<text x=50 y=22> … </text>
|
|
</g>
|
|
<g transform="translate(555 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 100deg)" />
|
|
<text x=7 y=22> E </text>
|
|
</g>
|
|
<g transform="translate(570 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 100deg)" />
|
|
<text x=7 y=22> O </text>
|
|
</g>
|
|
<g transform="translate(585 0)">
|
|
<rect width=15 height=30 fill="oklch(95% 15% 100deg)" />
|
|
<text x=7 y=22> F </text>
|
|
</g>
|
|
</svg>
|
|
</figure>
|
|
```
|
|
|
|
> type IPS = [IpsChunk]
|
|
>
|
|
> ipsFile :: Parser IPS
|
|
> ipsFile = do exact "PATCH"; ipsChunks
|
|
|
|
if the rest of the input is just the `EOF` marker (followed by the actual end),
|
|
then we're done. otherwise, parse a chunk and repeat.
|
|
|
|
explicitly checking for the end of input as well means that chunks with an
|
|
offset of `0x454F46`, which will _look_ like `EOF`, are still handled
|
|
correctly.
|
|
|
|
> ipsChunks :: Parser [IpsChunk]
|
|
> ipsChunks = stop <|> go where
|
|
> stop = do exact "EOF"
|
|
> eof
|
|
> pure []
|
|
> go = do next <- ipsChunk
|
|
> rest <- ipsChunks
|
|
> pure (next : rest)
|
|
|
|
=== chunks
|
|
|
|
a chunk can either be a [run-length-encoded](#rle) or a [literal](#lit) chunk.
|
|
in either case, they are preceded by a 24-bit offset giving their start
|
|
position.
|
|
|
|
> data IpsChunk = IpsChunk { offset :: Word32, body :: IpsChunkBody }
|
|
>
|
|
> data IpsChunkBody
|
|
> = Lit ByteString
|
|
> | RLE { size :: Word16, value :: Word8 }
|
|
>
|
|
> ipsChunk :: Parser IpsChunk
|
|
> ipsChunk = IpsChunk <$> word24BE <*> (rle <|> lit)
|
|
|
|
a [**run-length encoded chunk**]{#rle} indicates a single byte being repeated a
|
|
given number of times. it is represented by two zero bytes, followed by a
|
|
two-byte length, and a single byte value. the zero bytes exist to distinguish it
|
|
from a literal chunk, which cannot have a zero length.
|
|
|
|
```{=html}
|
|
<figure class=shadowed>
|
|
<svg viewBox="-1 -1 642 32" width=642 height=32>
|
|
<style>
|
|
rect { stroke-width: 1px; stroke: currentcolor; }
|
|
text { text-anchor: middle }
|
|
</style>
|
|
<g>
|
|
<rect width=240 height=30 fill="oklch(95% 15% 80deg)" />
|
|
<text x=120 y=22> off (3) </text>
|
|
</g>
|
|
<g transform="translate(240 0)">
|
|
<rect width=160 height=30 fill="oklch(95% 15% 60deg)" />
|
|
<text x=80 y=22> "00" (2) </text>
|
|
</g>
|
|
<g transform="translate(400 0)">
|
|
<rect width=160 height=30 fill="oklch(95% 15% 40deg)" />
|
|
<text x=80 y=22> size (2) </text>
|
|
</g>
|
|
<g transform="translate(560 0)">
|
|
<rect width=80 height=30 fill="oklch(95% 15% 20deg)" />
|
|
<text x=40 y=22> val (1) </text>
|
|
</g>
|
|
</svg>
|
|
</figure>
|
|
```
|
|
|
|
> rle :: Parser IpsChunkBody
|
|
> rle = do
|
|
> exact [0,0]
|
|
> size <- word16BE
|
|
> value <- word8
|
|
> pure (RLE { size, value })
|
|
|
|
a [**literal chunk**]{#lit} consists of a 16-bit (non-zero) length, followed by
|
|
the data it contains. actually checking the size is non-zero isn't needed,
|
|
because `rle` is attempted first and will succeed in that case.
|
|
|
|
```{=html}
|
|
<figure class=shadowed>
|
|
<svg viewBox="-1 -1 642 32" width=642 height=32>
|
|
<style>
|
|
rect, path { stroke-width: 1px; stroke: currentcolor; }
|
|
text { text-anchor: middle; }
|
|
</style>
|
|
<g>
|
|
<rect width=240 height=30 fill="oklch(95% 15% 80deg)" />
|
|
<text x=120 y=22> off (3) </text>
|
|
</g>
|
|
<g transform="translate(240 0)">
|
|
<rect width=160 height=30 fill="oklch(95% 15% 60deg)" />
|
|
<text x=80 y=22> size (2) </text>
|
|
</g>
|
|
<g transform="translate(400 0)">
|
|
<path fill="oklch(95% 15% 40deg)"
|
|
d="M 240,0 h -240 v 30 h 240
|
|
l -5,-5 5,-5 -5,-5 5,-5 -5,-5 z" />
|
|
<text x=120 y=22> data (<tspan font-style=italic>size</tspan>) … </text>
|
|
</g>
|
|
</svg>
|
|
</figure>
|
|
```
|
|
|
|
> lit :: Parser IpsChunkBody
|
|
> lit = do
|
|
> size <- word16BE
|
|
> Lit <$> bytes (fromIntegral size)
|
|
|
|
|
|
== BPS file format
|
|
|
|
BPS uses arbitrary-size numbers using a variable-length, **little-endian**
|
|
encoding.
|
|
|
|
Each byte contains seven bits of data, using the most significant bit to
|
|
indicate whether the number continues. One extra quirk is that each non-final
|
|
byte is also decremented. the rationale for this is that otherwise, it would be
|
|
possible to encode `1` as `0x81` but also as `0x01_80`, with the meaning
|
|
`000 0000 000 0001` with extra zero padding. with this extra step, the second
|
|
encoding actually represents `000 0001 000 0001`, or 129.
|
|
|
|
so we read the input until reaching a byte with the MSB set, plus one more, then
|
|
shift them all together, right to left.
|
|
|
|
|
|
> bpsNumber :: Parser Natural
|
|
> bpsNumber = do
|
|
> rest <- bytesWhile \b -> not (testBit b 7)
|
|
> end <- word8
|
|
> pure (BS.foldr' combine (fromIntegral (clearBit end 7)) rest)
|
|
> where
|
|
> combine byte acc = fromIntegral byte .|. (acc + 1) `shiftL` 7
|
|
|
|
|
|
<details>
|
|
<summary>
|
|
testing `bpsNumber`
|
|
</summary>
|
|
<!--
|
|
#ifdef TESTS
|
|
-->
|
|
|
|
this function is _almost_ a nice fold, but i'm not yet 100% certain the `(+ 1)`
|
|
is in the right place. so here's a more literal translation of the C code:
|
|
|
|
> bpsNumberSpec :: Parser Natural
|
|
> bpsNumberSpec = loop 0 1 where
|
|
> loop acc shft = do
|
|
> byte <- word8
|
|
> let acc' = acc + fromIntegral (clearBit byte 7) * shft
|
|
> let shft' = shft `shiftL` 7
|
|
> if testBit byte 7 then
|
|
> pure acc'
|
|
> else
|
|
> loop (acc' + shft') shft'
|
|
|
|
a valid encoding of a number is a string of zero or more bytes with the MSB
|
|
unset, followed by one with it set.
|
|
|
|
> newtype BpsNumber = BpsNumber ByteString
|
|
> deriving Show via ParserInput
|
|
>
|
|
> instance Arbitrary BpsNumber where
|
|
> arbitrary = do
|
|
> rest :: [Word8] <- arbitrary
|
|
> final :: Word8 <- arbitrary
|
|
> let list = map (\b -> clearBit b 7) rest ++ [setBit final 7]
|
|
> pure (BpsNumber (BS.pack list))
|
|
|
|
both functions should accept any string of this format, and should each give the
|
|
same result.
|
|
|
|
> prop_bpsOk (BpsNumber e) =
|
|
> let bps' = runParser bpsNumberSpec e
|
|
> bps = runParser bpsNumber e in
|
|
> isRight bps' .&&. isRight bps .&&. bps' === bps
|
|
|
|
<!--
|
|
#endif
|
|
-->
|
|
</details>
|
|
|
|
|
|
> props :: [(String, Property)]
|
|
> props =
|
|
> [("prop_bytes_short", property prop_bytes_short),
|
|
> ("prop_bytes_enough", property prop_bytes_enough),
|
|
> ("prop_bytes_length", property prop_bytes_length),
|
|
> ("prop_bytes_append", property prop_bytes_append),
|
|
> ("prop_bytesWhile_succeed", property prop_bytesWhile_succeed),
|
|
> ("prop_bytesWhile_result", property prop_bytesWhile_result),
|
|
> ("prop_bytesWhile_rest", property prop_bytesWhile_rest),
|
|
> ("prop_bpsOk", property prop_bpsOk)]
|
|
>
|
|
> testAll :: IO ()
|
|
> testAll = forM_ props \(name, prop) -> do
|
|
> putStr (name ++ ": ")
|
|
> quickCheck prop
|
|
|
|
|
|
== applying a patch
|
|
|
|
|
|
> _outSize :: [IpsChunk] -> ByteString -> Int
|
|
> _outSize patch orig =
|
|
> foldl' (\s c -> max s (end c)) (BS.length orig) patch
|
|
> where
|
|
> end (IpsChunk offset body) = fromIntegral offset + size body
|
|
> size (Lit str) = BS.length str
|
|
> size (RLE sz _) = fromIntegral sz
|
|
|
|
|
|
|
|
|
|
> main :: IO ()
|
|
> main = testAll
|