quox/lib/Quox/Parser/Lexer.idr
rhiannon morris d9cdf1306d fix IsReserved
IsReserved should be true for e.g. "λ" but not "fun", since only the
first can show up in the lexer output
2023-11-05 15:43:20 +01:00


module Quox.Parser.Lexer
import Quox.CharExtra
import Quox.NatExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name
%default total
%language ElabReflection
||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ Str string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" with ascii nat directly after
||| @ Sup superscript or ^ number (displacement, or universe for ★)
public export
data Token =
    Reserved String
  | Name PName
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
  | Sup Nat
%runElab derive "Token" [Eq, Ord, Show]
||| token or whitespace
||| @ Skip whitespace, comments, etc
||| @ Invalid a token which failed a post-lexer check
||| (e.g. a qualified name containing a keyword)
||| @ T a well formed token
public export
data ExtToken = Skip | Invalid String String | T Token
%runElab derive "ExtToken" [Eq, Ord, Show]
public export
data ErrorReason =
    NoRuleApply
  | ComposeNotClosing (Int, Int) (Int, Int)
  | Other String
%runElab derive "ErrorReason" [Eq, Ord, Show]
public export
record Error where
  constructor Err
  reason : ErrorReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
private
skip : Lexer -> Tokenizer ExtToken
skip t = match t $ const Skip
private
tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
tmatch t f = match t (T . f)
export
fromStringLit : (String -> Token) -> String -> ExtToken
fromStringLit f str =
  case go $ unpack $ drop 1 $ dropLast 1 str of
    Left err => Invalid err str
    Right ok => T $ f $ pack ok
  where
    Interpolation Char where interpolate = singleton
    go, hexEscape : List Char -> Either String (List Char)
    go [] = Right []
    go ['\\'] = Left "string ends with \\"
    go ('\\' :: 'n' :: cs) = ('\n' ::) <$> go cs
    go ('\\' :: 't' :: cs) = ('\t' ::) <$> go cs
    go ('\\' :: 'x' :: cs) = hexEscape cs
    go ('\\' :: 'X' :: cs) = hexEscape cs
    go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs
    go ('\\' :: '"' :: cs) = ('"' ::) <$> go cs
    -- [todo] others
    go ('\\' :: c :: _) = Left "unknown escape '\{c}'"
    go (c :: cs) = (c ::) <$> go cs
    hexEscape cs =
      case break (== ';') cs of
        (hs, ';' :: rest) => do
          let hs = pack hs
          let Just c = Int.fromHex hs
            | Nothing => Left #"invalid hex string "\#{hs}" in escape"#
          if isCodepoint c
            then (chr c ::) <$> go (assert_smaller cs rest)
            else Left "codepoint \{hs} out of range"
        _ => Left "unterminated hex escape"
private
string : Tokenizer ExtToken
string = match stringLit $ fromStringLit Str
%hide binLit
%hide octLit
%hide hexLit
private
nat : Tokenizer ExtToken
nat = match hexLit fromHexLit
  <|> tmatch decLit fromDecLit
  where
    withUnderscores : Lexer -> Lexer
    withUnderscores l = l <+> many (opt (is '_') <+> l)
    withoutUnderscores : String -> String
    withoutUnderscores = pack . go . unpack where
      go : List Char -> List Char
      go [] = []
      go ('_' :: cs) = go cs
      go (c :: cs) = c :: go cs
    decLit =
      withUnderscores (range '0' '9') <+> reject idContEnd
    hexLit =
      approx "0x" <+>
      withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
      reject idContEnd
    fromDecLit : String -> Token
    fromDecLit = Nat . cast . withoutUnderscores
    fromHexLit : String -> ExtToken
    fromHexLit str =
      maybe (Invalid "invalid hex sequence" str) (T . Nat) $
      fromHex $ withoutUnderscores $ drop 2 str
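-- For illustration: "1_000" lexes to Nat 1000 and "0xff" to Nat 255;
-- "123abc" is not a numeric literal, since `reject idContEnd` forbids an
-- identifier character immediately after the digits.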
private
tag : Tokenizer ExtToken
tag = tmatch (is '\'' <+> name) (Tag . drop 1)
  <|> match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1)
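-- For illustration: 'foo lexes to Tag "foo", and the quoted form
-- '"two words" lexes to Tag "two words" via fromStringLit.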
private %inline
fromSup : Char -> Char
fromSup c = case c of
  '⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
  '⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c
private %inline
supToNat : String -> Nat
supToNat = cast . pack . map fromSup . unpack
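-- For illustration: supToNat "¹²" is 12, each superscript digit being
-- mapped to its ASCII counterpart before the cast.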
-- ★0, Type0. base ★/Type is a Reserved and ★¹/Type¹ are sequences of two tokens
private
universe : Tokenizer ExtToken
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer ExtToken
  universeWith pfx =
    let len = length pfx in
    tmatch (exact pfx <+> digits) (TYPE . cast . drop len)
private
sup : Tokenizer ExtToken
sup = tmatch (some $ pred isSupDigit) (Sup . supToNat)
  <|> tmatch (is '^' <+> digits) (Sup . cast . drop 1)
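-- For illustration: both "⁴²" and "^42" lex to Sup 42.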
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc);
  |||        also a sequence ending in one of those, like `#[`, since the
  |||        difference relates to lookahead
  public export
  data Reserved1 = Word String | Sym String | Punc String
  %runElab derive "Reserved1" [Eq, Ord, Show]
  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]
public export
Sym1, Word1, Punc1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word
Punc1 = Only . Punc
public export
resString1 : Reserved1 -> String
resString1 (Punc x) = x
resString1 (Word w) = w
resString1 (Sym s) = s
||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r
||| return both representative strings for a token description
public export
resString2 : Reserved -> List String
resString2 (Only r) = [resString1 r]
resString2 (r `Or` s) = [resString1 r, resString1 s]
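-- For illustration: resString (Sym "⇒" `Or` Sym "=>") is "⇒", while
-- resString2 of the same value is ["⇒", "=>"].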
private
resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
resTokenizer1 r str =
  let res : String -> Token := const $ Reserved str in
  case r of Word w => tmatch (exact w <+> reject idContEnd) res
            Sym s => tmatch (exact s <+> reject symCont) res
            Punc x => tmatch (exact x) res
||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer ExtToken
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
[Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
Punc1 ",", Punc1 ";", Punc1 "#[",
Sym1 "@",
Sym1 ":",
Sym "" `Or` Sym "=>",
Sym "" `Or` Sym "->",
Sym "×" `Or` Sym "**",
Sym "" `Or` Sym "==",
Sym "" `Or` Sym "::",
Punc1 ".",
Word1 "case",
Word1 "case0", Word1 "case1",
Word "caseω" `Or` Word "case#",
Word1 "return",
Word1 "of",
Word1 "fst", Word1 "snd",
Word1 "_",
Word1 "Eq",
Word "λ" `Or` Word "fun",
Word "δ" `Or` Word "dfun",
Word "ω" `Or` Sym "#",
Sym "" `Or` Word "Type",
Word "" `Or` Word "Nat",
Word1 "IOState",
Word1 "String",
Word1 "zero", Word1 "succ",
Word1 "coe", Word1 "comp",
Word1 "def",
Word1 "def0",
Word "defω" `Or` Word "def#",
Word1 "postulate",
Word1 "postulate0",
Word "postulateω" `Or` Word "postulate#",
Sym1 "=",
Word1 "load",
Word1 "namespace"]
public export
reservedStrings : List String
reservedStrings = map resString reserved
public export
allReservedStrings : List String
allReservedStrings = foldMap resString2 reserved
||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = So (str `elem` reservedStrings)
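-- Illustrative check (added for exposition; not in the original file):
-- reservedStrings contains only the representative spellings, so
-- `IsReserved "λ"` reduces to `So True` and is proved by `Oh`, whereas
-- `IsReserved "fun"` reduces to `So False` and has no inhabitants.
private
isReservedExample : IsReserved "λ"
isReservedExample = Oh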
private
name : Tokenizer ExtToken
name =
  match name $ \str =>
    let parts = split (== '.') $ normalizeNfc str in
    case find (`elem` allReservedStrings) (toList parts) of
      Nothing => T $ Name $ fromListP parts
      Just w => Invalid "reserved word '\{w}' inside name \{str}" str
export
tokens : Tokenizer ExtToken
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+> -- Type<i> takes precedence over bare Type
  map resTokenizer reserved <+>
  [sup, nat, string, tag, name]
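-- For illustration (per the comment above `universe`): "★3" is the single
-- token TYPE 3, bare "★" is Reserved "★", and "★¹" is the two tokens
-- Reserved "★" followed by Sup 1.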
export
check : Alternative f =>
        WithBounds ExtToken -> Either Error (f (WithBounds Token))
check (MkBounded val irr bounds@(MkBounds line col _ _)) = case val of
  Skip => Right empty
  T tok => Right $ pure $ MkBounded tok irr bounds
  Invalid msg tok => Left $ Err (Other msg) line col (index 0 tok)
export
toErrorReason : StopReason -> Maybe ErrorReason
toErrorReason EndInput = Nothing
toErrorReason NoRuleApply = Just NoRuleApply
toErrorReason (ComposeNotClosing s e) = Just $ ComposeNotClosing s e
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case toErrorReason reason of
    Nothing => concatMap check res @{MonoidApplicative}
    Just e => Left $ Err {reason = e, line, col, char = index 0 str}
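-- For illustration (an assumed example, not from the original source):
-- lex "λ x ⇒ x" should yield bounded tokens for Reserved "λ", the name x,
-- Reserved "⇒", and the name x again, with whitespace skipped by `check`.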