string/nat lit stuff

This commit is contained in:
rhiannon morris 2023-11-05 15:38:13 +01:00
parent 3b9a339e5e
commit e211887a34
5 changed files with 146 additions and 41 deletions

View file

@ -166,3 +166,10 @@ isWhitespace ch = ch == '\t' || ch == '\r' || ch == '\n' || isSeparator ch
export
%foreign "scheme:string-normalize-nfc"
normalizeNfc : String -> String
export
isCodepoint : Int -> Bool
isCodepoint n =
n <= 0x10FFFF &&
not (n >= 0xD800 && n <= 0xDBFF || n >= 0xDC00 && n <= 0xDFFF)

View file

@ -4,6 +4,7 @@ import public Data.Nat
import Data.Nat.Division
import Data.SnocList
import Data.Vect
import Data.String
%default total
@ -52,6 +53,42 @@ parameters {base : Nat} {auto 0 _ : base `GTE` 2} (chars : Vect base Char)
showAtBase : Nat -> String
showAtBase = pack . showAtBase' []
export
showHex : Nat -> String
showHex = showAtBase $ fromList $ unpack "0123456789ABCDEF"
namespace Nat
export
showHex : Nat -> String
showHex = showAtBase $ fromList $ unpack "0123456789abcdef"
namespace Int
export
showHex : Int -> String
showHex x =
if x < 0 then "-" ++ Nat.showHex (cast (-x)) else Nat.showHex (cast x)
namespace Int
export
fromHexit : Char -> Maybe Int
fromHexit c =
if c >= '0' && c <= '9' then Just $ ord c - ord '0'
else if c >= 'a' && c <= 'f' then Just $ ord c - ord 'a' + 10
else if c >= 'A' && c <= 'F' then Just $ ord c - ord 'A' + 10
else Nothing
private
fromHex' : Int -> String -> Maybe Int
fromHex' acc str = case strM str of
StrNil => Just acc
StrCons c cs => fromHex' (16 * acc + !(fromHexit c)) (assert_smaller str cs)
export %inline
fromHex : String -> Maybe Int
fromHex = fromHex' 0
namespace Nat
export
fromHexit : Char -> Maybe Nat
fromHexit = map cast . Int.fromHexit
export %inline
fromHex : String -> Maybe Nat
fromHex = map cast . Int.fromHex

View file

@ -1,6 +1,7 @@
module Quox.Parser.Lexer
import Quox.CharExtra
import Quox.NatExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
@ -72,29 +73,84 @@ tmatch t f = match t (T . f)
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: 'n' :: cs) = '\n' :: go cs
go ('\\' :: 't' :: cs) = '\t' :: go cs
fromStringLit : (String -> Token) -> String -> ExtToken
fromStringLit f str =
case go $ unpack $ drop 1 $ dropLast 1 str of
Left err => Invalid err str
Right ok => T $ f $ pack ok
where
Interpolation Char where interpolate = singleton
go, hexEscape : List Char -> Either String (List Char)
go [] = Right []
go ['\\'] = Left "string ends with \\"
go ('\\' :: 'n' :: cs) = ('\n' ::) <$> go cs
go ('\\' :: 't' :: cs) = ('\t' ::) <$> go cs
go ('\\' :: 'x' :: cs) = hexEscape cs
go ('\\' :: 'X' :: cs) = hexEscape cs
go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs
go ('\\' :: '"' :: cs) = ('"' ::) <$> go cs
-- [todo] others
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
go ('\\' :: c :: _) = Left "unknown escape '\{c}'"
go (c :: cs) = (c ::) <$> go cs
hexEscape cs =
case break (== ';') cs of
(hs, ';' :: rest) => do
let hs = pack hs
let Just c = Int.fromHex hs
| Nothing => Left #"invalid hex string "\#{hs}" in escape"#
if isCodepoint c
then (chr c ::) <$> go (assert_smaller cs rest)
else Left "codepoint \{hs} out of range"
_ => Left "unterminated hex escape"
private
string : Tokenizer ExtToken
string = tmatch stringLit (Str . fromStringLit)
string = match stringLit $ fromStringLit Str
%hide binLit
%hide octLit
%hide hexLit
private
nat : Tokenizer ExtToken
nat = tmatch (some (range '0' '9')) (Nat . cast)
nat = match hexLit fromHexLit
<|> tmatch decLit fromDecLit
where
withUnderscores : Lexer -> Lexer
withUnderscores l = l <+> many (opt (is '_') <+> l)
withoutUnderscores : String -> String
withoutUnderscores = pack . go . unpack where
go : List Char -> List Char
go [] = []
go ('_' :: cs) = go cs
go (c :: cs) = c :: go cs
decLit =
withUnderscores (range '0' '9') <+> reject idContEnd
hexLit =
approx "0x" <+>
withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
reject idContEnd
fromDecLit : String -> Token
fromDecLit = Nat . cast . withoutUnderscores
fromHexLit : String -> ExtToken
fromHexLit str =
maybe (Invalid "invalid hex sequence" str) (T . Nat) $
fromHex $ withoutUnderscores $ drop 2 str
private
tag : Tokenizer ExtToken
tag = tmatch (is '\'' <+> name) (Tag . drop 1)
<|> tmatch (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)
<|> match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1)

View file

@ -7,6 +7,7 @@ import Quox.Pretty
import Quox.EffExtra
import Quox.CharExtra
import Quox.NatExtra
import Data.DPair
import Data.List1
import Data.String
@ -278,28 +279,16 @@ prelude = """
;;;;;;
"""
export
toHex : Int -> String
toHex x =
case toHex' x of
[<] => "0"
cs => concatMap singleton cs
where
toHex' : Int -> SnocList Char
toHex' x =
if x == 0 then [<] else
let d = x `div` 16
m = x `mod` 16 in
toHex' (assert_smaller x d) :<
assert_total strIndex "0123456789abcdef" m
export
escape : String -> String
escape = concatMap esc1 . unpack where
escape = foldMap esc1 . unpack where
esc1 : Char -> String
esc1 c = if c < ' ' || c > '~' || c == '\\' || c == '"'
then "\\x" ++ toHex (ord c) ++ ";"
else singleton c
esc1 c =
if c == '\\' || c == '"' then
"\\" ++ singleton c
else if c < ' ' || c > '~' then
"\\x" ++ showHex (ord c) ++ ";"
else singleton c
export covering
defToScheme : Name -> Definition -> Eff Scheme (Maybe Sexp)

View file

@ -148,15 +148,31 @@ tests = "lexer" :- [
],
"strings" :- [
lexes #" "" "# [Str ""],
lexes #" "abc" "# [Str "abc"],
lexes #" "\"" "# [Str "\""],
lexes #" "\\" "# [Str "\\"],
lexes #" "\\\"" "# [Str "\\\""],
todo "other escapes"
lexes #" "" "# [Str ""],
lexes #" "abc" "# [Str "abc"],
lexes #" "\"" "# [Str "\""],
lexes #" "\\" "# [Str "\\"],
lexes #" "\\\"" "# [Str "\\\""],
lexes #" "\t" "# [Str "\t"],
lexes #" "\n" "# [Str "\n"],
lexes #" "🐉" "# [Str "🐉"],
lexes #" "\x1f409;" "# [Str "🐉"],
lexFail #" "\q" "#,
lexFail #" "\" "#
],
todo "naturals",
"naturals" :- [
lexes "0" [Nat 0],
lexes "123" [Nat 123],
lexes "69_420" [Nat 69420],
lexes "0x123" [Nat 0x123],
lexes "0xbeef" [Nat 0xbeef],
lexes "0xBEEF" [Nat 0xBEEF],
lexes "0XBEEF" [Nat 0xBEEF],
lexes "0xbaba_baba" [Nat 0xbabababa],
lexFail "123abc",
lexFail "0x123abcghi"
],
"universes" :- [
lexes "Type0" [TYPE 0],