rhiannon morris
d9cdf1306d
IsReserved should be true for e.g. "λ" but not "fun", since only the first can show up in the lexer output
339 lines
9.7 KiB
Idris
module Quox.Parser.Lexer

import Quox.CharExtra
import Quox.NatExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name


%default total
%language ElabReflection

||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ Str string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" with ascii nat directly after
||| @ Sup superscript or ^ number (displacement, or universe for ★)
public export
data Token =
    Reserved String
  | Name PName
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
  | Sup Nat
%runElab derive "Token" [Eq, Ord, Show]

||| token or whitespace
||| @ Skip whitespace, comments, etc
||| @ Invalid a token which failed a post-lexer check
|||   (e.g. a qualified name containing a keyword)
||| @ T a well formed token
public export
data ExtToken = Skip | Invalid String String | T Token
%runElab derive "ExtToken" [Eq, Ord, Show]

public export
data ErrorReason =
    NoRuleApply
  | ComposeNotClosing (Int, Int) (Int, Int)
  | Other String
%runElab derive "ErrorReason" [Eq, Ord, Show]

public export
record Error where
  constructor Err
  reason : ErrorReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]

private
skip : Lexer -> Tokenizer ExtToken
skip t = match t $ const Skip

private
tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
tmatch t f = match t (T . f)

export
fromStringLit : (String -> Token) -> String -> ExtToken
fromStringLit f str =
  case go $ unpack $ drop 1 $ dropLast 1 str of
    Left err => Invalid err str
    Right ok => T $ f $ pack ok
  where
    Interpolation Char where interpolate = singleton

    go, hexEscape : List Char -> Either String (List Char)

    go [] = Right []
    go ['\\'] = Left "string ends with \\"
    go ('\\' :: 'n' :: cs) = ('\n' ::) <$> go cs
    go ('\\' :: 't' :: cs) = ('\t' ::) <$> go cs
    go ('\\' :: 'x' :: cs) = hexEscape cs
    go ('\\' :: 'X' :: cs) = hexEscape cs
    go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs
    go ('\\' :: '"' :: cs) = ('"' ::) <$> go cs
    -- [todo] others
    go ('\\' :: c :: _) = Left "unknown escape '\{c}'"
    go (c :: cs) = (c ::) <$> go cs

    hexEscape cs =
      case break (== ';') cs of
        (hs, ';' :: rest) => do
          let hs = pack hs
          let Just c = Int.fromHex hs
            | Nothing => Left #"invalid hex string "\#{hs}" in escape"#
          if isCodepoint c
            then (chr c ::) <$> go (assert_smaller cs rest)
            else Left "codepoint \{hs} out of range"
        _ => Left "unterminated hex escape"

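-- Worked example (added note, not in the original source): with the clauses
-- above, the literal "a\nb" decodes to T (Str _) with a real newline between
-- 'a' and 'b', and "\x48;i" decodes to T (Str "Hi"), since fromHex reads the
-- digits before the ';' as 0x48 = 'H'.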
private
string : Tokenizer ExtToken
string = match stringLit $ fromStringLit Str

%hide binLit
%hide octLit
%hide hexLit

private
nat : Tokenizer ExtToken
nat = match hexLit fromHexLit
  <|> tmatch decLit fromDecLit
  where
    withUnderscores : Lexer -> Lexer
    withUnderscores l = l <+> many (opt (is '_') <+> l)

    withoutUnderscores : String -> String
    withoutUnderscores = pack . go . unpack where
      go : List Char -> List Char
      go [] = []
      go ('_' :: cs) = go cs
      go (c :: cs) = c :: go cs

    decLit =
      withUnderscores (range '0' '9') <+> reject idContEnd

    hexLit =
      approx "0x" <+>
      withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
      reject idContEnd

    fromDecLit : String -> Token
    fromDecLit = Nat . cast . withoutUnderscores

    fromHexLit : String -> ExtToken
    fromHexLit str =
      maybe (Invalid "invalid hex sequence" str) (T . Nat) $
        fromHex $ withoutUnderscores $ drop 2 str

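-- Worked example (added note, not in the original source): withoutUnderscores
-- strips '_' before converting, so "1_000" lexes to T (Nat 1000), and "0xff"
-- to T (Nat 255) via fromHex on the part after the "0x" prefix.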
private
tag : Tokenizer ExtToken
tag = tmatch (is '\'' <+> name) (Tag . drop 1)
  <|> match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1)

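-- Worked example (added note, not in the original source): 'foo lexes to
-- T (Tag "foo"), and '"two words" to T (Tag "two words"), with the leading
-- quote dropped and the string form decoded by fromStringLit.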
private %inline
fromSup : Char -> Char
fromSup c = case c of
  '⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
  '⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c

private %inline
supToNat : String -> Nat
supToNat = cast . pack . map fromSup . unpack

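-- Worked example (added note, not in the original source): fromSup maps each
-- superscript digit to its ASCII digit, so supToNat "¹²" evaluates to 12.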
-- ★0, Type0. base ★/Type is a Reserved and ★¹/Type¹ are sequences of two tokens
private
universe : Tokenizer ExtToken
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer ExtToken
  universeWith pfx =
    let len = length pfx in
    tmatch (exact pfx <+> digits) (TYPE . cast . drop len)

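-- Worked example (added note, not in the original source): "★3" and "Type3"
-- both lex to T (TYPE 3); a bare "★" or "Type" has no digits after it, so it
-- is matched by the reserved tokenizer defined below instead.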
private
sup : Tokenizer ExtToken
sup = tmatch (some $ pred isSupDigit) (Sup . supToNat)
  <|> tmatch (is '^' <+> digits) (Sup . cast . drop 1)

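-- Worked example (added note, not in the original source): "⁴" and "^4" both
-- lex to T (Sup 4).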
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts

namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc);
  |||   also a sequence ending in one of those, like `#[`, since the
  |||   difference relates to lookahead
  public export
  data Reserved1 = Word String | Sym String | Punc String
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]

public export
Sym1, Word1, Punc1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word
Punc1 = Only . Punc

public export
resString1 : Reserved1 -> String
resString1 (Punc x) = x
resString1 (Word w) = w
resString1 (Sym s) = s

||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r

||| return both representative strings for a token description
public export
resString2 : Reserved -> List String
resString2 (Only r) = [resString1 r]
resString2 (r `Or` s) = [resString1 r, resString1 s]

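-- Worked example (added note, not in the original source): for
-- Sym "⇒" `Or` Sym "=>", resString gives "⇒" (the full-unicode spelling) and
-- resString2 gives ["⇒", "=>"].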
private
resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
resTokenizer1 r str =
  let res : String -> Token := const $ Reserved str in
  case r of Word w => tmatch (exact w <+> reject idContEnd) res
            Sym s => tmatch (exact s <+> reject symCont) res
            Punc x => tmatch (exact x) res

||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer ExtToken
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)

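-- Worked example (added note, not in the original source): both branches of an
-- `Or` are handed `resString1 r`, so resTokenizer (Sym "⇒" `Or` Sym "=>")
-- accepts either spelling in the input but always produces Reserved "⇒".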
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
  [Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
   Punc1 ",", Punc1 ";", Punc1 "#[",
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Punc1 ".",
   Word1 "case",
   Word1 "case0", Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "fst", Word1 "snd",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   Word "ω" `Or` Sym "#",
   Sym "★" `Or` Word "Type",
   Word "ℕ" `Or` Word "Nat",
   Word1 "IOState",
   Word1 "String",
   Word1 "zero", Word1 "succ",
   Word1 "coe", Word1 "comp",
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Word1 "postulate",
   Word1 "postulate0",
   Word "postulateω" `Or` Word "postulate#",
   Sym1 "=",
   Word1 "load",
   Word1 "namespace"]

public export
reservedStrings : List String
reservedStrings = map resString reserved

public export
allReservedStrings : List String
allReservedStrings = foldMap resString2 reserved

||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = So (str `elem` reservedStrings)

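-- Added example (not in the original source), tying this to the commit message
-- above: resString of Word "λ" `Or` Word "fun" is "λ", so only "λ" appears in
-- reservedStrings. IsReserved "λ" therefore reduces to So True while
-- IsReserved "fun" does not hold, matching the fact that the lexer output only
-- ever contains Reserved "λ". A compile-time check along these lines should
-- typecheck (left commented out here):
--   exampleLambdaReserved : IsReserved "λ"
--   exampleLambdaReserved = Oh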
private
name : Tokenizer ExtToken
name =
  match name $ \str =>
    let parts = split (== '.') $ normalizeNfc str in
    case find (`elem` allReservedStrings) (toList parts) of
      Nothing => T $ Name $ fromListP parts
      Just w => Invalid "reserved word '\{w}' inside name \{str}" str

export
tokens : Tokenizer ExtToken
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+> -- Type<i> takes precedence over bare Type
  map resTokenizer reserved <+>
  [sup, nat, string, tag, name]

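-- Added note (not in the original source): with this ordering, "Type3" is
-- matched by universe (giving TYPE 3) before the reserved rules see the "Type"
-- prefix, while a bare "Type", which universe cannot match without digits,
-- falls through to the reserved tokenizer and comes out as Reserved "★".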
export
check : Alternative f =>
        WithBounds ExtToken -> Either Error (f (WithBounds Token))
check (MkBounded val irr bounds@(MkBounds line col _ _)) = case val of
  Skip => Right empty
  T tok => Right $ pure $ MkBounded tok irr bounds
  Invalid msg tok => Left $ Err (Other msg) line col (index 0 tok)

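-- Added note (not in the original source): instantiating f as List, check maps
-- Skip to Right [], keeps T tokens with their bounds, and turns an Invalid
-- token into a positioned Err; this is how whitespace and comments disappear
-- from the output of lex below.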
export
toErrorReason : StopReason -> Maybe ErrorReason
toErrorReason EndInput = Nothing
toErrorReason NoRuleApply = Just NoRuleApply
toErrorReason (ComposeNotClosing s e) = Just $ ComposeNotClosing s e

export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case toErrorReason reason of
    Nothing => concatMap check res @{MonoidApplicative}
    Just e => Left $ Err {reason = e, line, col, char = index 0 str}
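-- Usage sketch (added note, not in the original source; bounds omitted):
--   lex "λ x ⇒ x"
-- should give Right with the token sequence
--   Reserved "λ", Name for x, Reserved "⇒", Name for x
-- each wrapped in WithBounds, while input that matches no rule gives
-- Left (Err NoRuleApply ...) carrying the line, column, and offending
-- character.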