From e211887a34d7a880c588a32ed9464ee022112c70 Mon Sep 17 00:00:00 2001 From: rhiannon morris Date: Sun, 5 Nov 2023 15:38:13 +0100 Subject: [PATCH] string/nat lit stuff --- lib/Quox/CharExtra.idr | 7 ++++ lib/Quox/NatExtra.idr | 43 ++++++++++++++++++-- lib/Quox/Parser/Lexer.idr | 80 +++++++++++++++++++++++++++++++------ lib/Quox/Untyped/Scheme.idr | 27 ++++--------- tests/Tests/Lexer.idr | 30 ++++++++++---- 5 files changed, 146 insertions(+), 41 deletions(-) diff --git a/lib/Quox/CharExtra.idr b/lib/Quox/CharExtra.idr index a2a05a3..48c1fab 100644 --- a/lib/Quox/CharExtra.idr +++ b/lib/Quox/CharExtra.idr @@ -166,3 +166,10 @@ isWhitespace ch = ch == '\t' || ch == '\r' || ch == '\n' || isSeparator ch export %foreign "scheme:string-normalize-nfc" normalizeNfc : String -> String + + +export +isCodepoint : Int -> Bool +isCodepoint n = + n <= 0x10FFFF && + not (n >= 0xD800 && n <= 0xDBFF || n >= 0xDC00 && n <= 0xDFFF) diff --git a/lib/Quox/NatExtra.idr b/lib/Quox/NatExtra.idr index 42f627e..64248ec 100644 --- a/lib/Quox/NatExtra.idr +++ b/lib/Quox/NatExtra.idr @@ -4,6 +4,7 @@ import public Data.Nat import Data.Nat.Division import Data.SnocList import Data.Vect +import Data.String %default total @@ -52,6 +53,42 @@ parameters {base : Nat} {auto 0 _ : base `GTE` 2} (chars : Vect base Char) showAtBase : Nat -> String showAtBase = pack . showAtBase' [] -export -showHex : Nat -> String -showHex = showAtBase $ fromList $ unpack "0123456789ABCDEF" +namespace Nat + export + showHex : Nat -> String + showHex = showAtBase $ fromList $ unpack "0123456789abcdef" + +namespace Int + export + showHex : Int -> String + showHex x = + if x < 0 then "-" ++ Nat.showHex (cast (-x)) else Nat.showHex (cast x) + + +namespace Int + export + fromHexit : Char -> Maybe Int + fromHexit c = + if c >= '0' && c <= '9' then Just $ ord c - ord '0' + else if c >= 'a' && c <= 'f' then Just $ ord c - ord 'a' + 10 + else if c >= 'A' && c <= 'F' then Just $ ord c - ord 'A' + 10 + else Nothing + + private + fromHex' : Int -> String -> Maybe Int + fromHex' acc str = case strM str of + StrNil => Just acc + StrCons c cs => fromHex' (16 * acc + !(fromHexit c)) (assert_smaller str cs) + + export %inline + fromHex : String -> Maybe Int + fromHex = fromHex' 0 + +namespace Nat + export + fromHexit : Char -> Maybe Nat + fromHexit = map cast . Int.fromHexit + + export %inline + fromHex : String -> Maybe Nat + fromHex = map cast . Int.fromHex diff --git a/lib/Quox/Parser/Lexer.idr b/lib/Quox/Parser/Lexer.idr index 7ef791f..c18c4a5 100644 --- a/lib/Quox/Parser/Lexer.idr +++ b/lib/Quox/Parser/Lexer.idr @@ -1,6 +1,7 @@ module Quox.Parser.Lexer import Quox.CharExtra +import Quox.NatExtra import Quox.Name import Data.String.Extra import Data.SortedMap @@ -72,29 +73,84 @@ tmatch t f = match t (T . f) export -fromStringLit : String -> String -fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where - go : List Char -> List Char - go [] = [] - go ['\\'] = ['\\'] -- i guess??? - go ('\\' :: 'n' :: cs) = '\n' :: go cs - go ('\\' :: 't' :: cs) = '\t' :: go cs +fromStringLit : (String -> Token) -> String -> ExtToken +fromStringLit f str = + case go $ unpack $ drop 1 $ dropLast 1 str of + Left err => Invalid err str + Right ok => T $ f $ pack ok +where + Interpolation Char where interpolate = singleton + + go, hexEscape : List Char -> Either String (List Char) + + go [] = Right [] + go ['\\'] = Left "string ends with \\" + go ('\\' :: 'n' :: cs) = ('\n' ::) <$> go cs + go ('\\' :: 't' :: cs) = ('\t' ::) <$> go cs + go ('\\' :: 'x' :: cs) = hexEscape cs + go ('\\' :: 'X' :: cs) = hexEscape cs + go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs + go ('\\' :: '"' :: cs) = ('"' ::) <$> go cs -- [todo] others - go ('\\' :: c :: cs) = c :: go cs - go (c :: cs) = c :: go cs + go ('\\' :: c :: _) = Left "unknown escape '\{c}'" + go (c :: cs) = (c ::) <$> go cs + + hexEscape cs = + case break (== ';') cs of + (hs, ';' :: rest) => do + let hs = pack hs + let Just c = Int.fromHex hs + | Nothing => Left #"invalid hex string "\#{hs}" in escape"# + if isCodepoint c + then (chr c ::) <$> go (assert_smaller cs rest) + else Left "codepoint \{hs} out of range" + _ => Left "unterminated hex escape" private string : Tokenizer ExtToken -string = tmatch stringLit (Str . fromStringLit) +string = match stringLit $ fromStringLit Str + + +%hide binLit +%hide octLit +%hide hexLit private nat : Tokenizer ExtToken -nat = tmatch (some (range '0' '9')) (Nat . cast) +nat = match hexLit fromHexLit + <|> tmatch decLit fromDecLit +where + withUnderscores : Lexer -> Lexer + withUnderscores l = l <+> many (opt (is '_') <+> l) + + withoutUnderscores : String -> String + withoutUnderscores = pack . go . unpack where + go : List Char -> List Char + go [] = [] + go ('_' :: cs) = go cs + go (c :: cs) = c :: go cs + + decLit = + withUnderscores (range '0' '9') <+> reject idContEnd + + hexLit = + approx "0x" <+> + withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+> + reject idContEnd + + fromDecLit : String -> Token + fromDecLit = Nat . cast . withoutUnderscores + + fromHexLit : String -> ExtToken + fromHexLit str = + maybe (Invalid "invalid hex sequence" str) (T . Nat) $ + fromHex $ withoutUnderscores $ drop 2 str + private tag : Tokenizer ExtToken tag = tmatch (is '\'' <+> name) (Tag . drop 1) - <|> tmatch (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1) + <|> match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1) diff --git a/lib/Quox/Untyped/Scheme.idr b/lib/Quox/Untyped/Scheme.idr index 61ca482..36fbdd4 100644 --- a/lib/Quox/Untyped/Scheme.idr +++ b/lib/Quox/Untyped/Scheme.idr @@ -7,6 +7,7 @@ import Quox.Pretty import Quox.EffExtra import Quox.CharExtra +import Quox.NatExtra import Data.DPair import Data.List1 import Data.String @@ -278,28 +279,16 @@ prelude = """ ;;;;;; """ -export -toHex : Int -> String -toHex x = - case toHex' x of - [<] => "0" - cs => concatMap singleton cs -where - toHex' : Int -> SnocList Char - toHex' x = - if x == 0 then [<] else - let d = x `div` 16 - m = x `mod` 16 in - toHex' (assert_smaller x d) :< - assert_total strIndex "0123456789abcdef" m - export escape : String -> String -escape = concatMap esc1 . unpack where +escape = foldMap esc1 . unpack where esc1 : Char -> String - esc1 c = if c < ' ' || c > '~' || c == '\\' || c == '"' - then "\\x" ++ toHex (ord c) ++ ";" - else singleton c + esc1 c = + if c == '\\' || c == '"' then + "\\" ++ singleton c + else if c < ' ' || c > '~' then + "\\x" ++ showHex (ord c) ++ ";" + else singleton c export covering defToScheme : Name -> Definition -> Eff Scheme (Maybe Sexp) diff --git a/tests/Tests/Lexer.idr b/tests/Tests/Lexer.idr index 910a434..499aab4 100644 --- a/tests/Tests/Lexer.idr +++ b/tests/Tests/Lexer.idr @@ -148,15 +148,31 @@ tests = "lexer" :- [ ], "strings" :- [ - lexes #" "" "# [Str ""], - lexes #" "abc" "# [Str "abc"], - lexes #" "\"" "# [Str "\""], - lexes #" "\\" "# [Str "\\"], - lexes #" "\\\"" "# [Str "\\\""], - todo "other escapes" + lexes #" "" "# [Str ""], + lexes #" "abc" "# [Str "abc"], + lexes #" "\"" "# [Str "\""], + lexes #" "\\" "# [Str "\\"], + lexes #" "\\\"" "# [Str "\\\""], + lexes #" "\t" "# [Str "\t"], + lexes #" "\n" "# [Str "\n"], + lexes #" "🐉" "# [Str "🐉"], + lexes #" "\x1f409;" "# [Str "🐉"], + lexFail #" "\q" "#, + lexFail #" "\" "# ], - todo "naturals", + "naturals" :- [ + lexes "0" [Nat 0], + lexes "123" [Nat 123], + lexes "69_420" [Nat 69420], + lexes "0x123" [Nat 0x123], + lexes "0xbeef" [Nat 0xbeef], + lexes "0xBEEF" [Nat 0xBEEF], + lexes "0XBEEF" [Nat 0xBEEF], + lexes "0xbaba_baba" [Nat 0xbabababa], + lexFail "123abc", + lexFail "0x123abcghi" + ], "universes" :- [ lexes "Type0" [TYPE 0],