2023-03-12 13:28:37 -04:00
|
|
|
|
module Quox.Parser.Lexer
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
|
|
|
|
import Quox.CharExtra
|
2023-11-05 09:38:13 -05:00
|
|
|
|
import Quox.NatExtra
|
2023-02-28 14:51:54 -05:00
|
|
|
|
import Quox.Name
|
|
|
|
|
import Data.String.Extra
|
|
|
|
|
import Data.SortedMap
|
2023-03-06 06:04:43 -05:00
|
|
|
|
import public Data.String -- for singleton to reduce in IsReserved
|
2023-03-04 15:02:51 -05:00
|
|
|
|
import public Data.List.Elem
|
|
|
|
|
import public Text.Lexer
|
|
|
|
|
import public Text.Lexer.Tokenizer
|
2023-03-02 13:52:32 -05:00
|
|
|
|
import Derive.Prelude
|
2023-02-28 14:51:54 -05:00
|
|
|
|
%hide TT.Name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%default total
|
|
|
|
|
%language ElabReflection
|
|
|
|
|
|
|
|
|
|
|
2023-03-08 10:46:29 -05:00
|
|
|
|
||| @ Reserved reserved token
|
|
|
|
|
||| @ Name name, possibly qualified
|
|
|
|
|
||| @ Nat nat literal
|
2023-11-02 13:14:28 -04:00
|
|
|
|
||| @ Str string literal
|
2023-03-08 10:46:29 -05:00
|
|
|
|
||| @ Tag tag literal
|
2023-05-21 14:09:34 -04:00
|
|
|
|
||| @ TYPE "Type" or "★" with ascii nat directly after
|
|
|
|
|
||| @ Sup superscript or ^ number (displacement, or universe for ★)
|
2023-02-28 14:51:54 -05:00
|
|
|
|
public export
|
2023-03-08 10:46:29 -05:00
|
|
|
|
data Token =
|
|
|
|
|
Reserved String
|
2023-03-12 13:28:37 -04:00
|
|
|
|
| Name PName
|
2023-03-08 10:46:29 -05:00
|
|
|
|
| Nat Nat
|
|
|
|
|
| Str String
|
|
|
|
|
| Tag String
|
|
|
|
|
| TYPE Nat
|
2023-05-21 14:09:34 -04:00
|
|
|
|
| Sup Nat
|
2023-03-02 13:52:32 -05:00
|
|
|
|
%runElab derive "Token" [Eq, Ord, Show]
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-09-24 11:36:20 -04:00
|
|
|
|
||| token or whitespace
|
|
|
|
|
||| @ Skip whitespace, comments, etc
|
|
|
|
|
||| @ Invalid a token which failed a post-lexer check
|
|
|
|
|
||| (e.g. a qualified name containing a keyword)
|
|
|
|
|
||| @ T a well formed token
|
2023-03-04 15:02:51 -05:00
|
|
|
|
public export
|
2023-09-24 11:36:20 -04:00
|
|
|
|
data ExtToken = Skip | Invalid String String | T Token
|
|
|
|
|
%runElab derive "ExtToken" [Eq, Ord, Show]
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
|
|
|
|
|
2023-09-24 11:36:20 -04:00
|
|
|
|
public export
|
|
|
|
|
data ErrorReason =
|
|
|
|
|
NoRuleApply
|
|
|
|
|
| ComposeNotClosing (Int, Int) (Int, Int)
|
|
|
|
|
| Other String
|
|
|
|
|
%runElab derive "ErrorReason" [Eq, Ord, Show]
|
|
|
|
|
|
2023-02-28 14:51:54 -05:00
|
|
|
|
public export
|
|
|
|
|
record Error where
|
|
|
|
|
constructor Err
|
2023-09-24 11:36:20 -04:00
|
|
|
|
reason : ErrorReason
|
2023-02-28 14:51:54 -05:00
|
|
|
|
line, col : Int
|
|
|
|
|
||| `Nothing` if the error is at the end of the input
|
|
|
|
|
char : Maybe Char
|
2023-03-02 13:52:32 -05:00
|
|
|
|
%runElab derive "StopReason" [Eq, Ord, Show]
|
|
|
|
|
%runElab derive "Error" [Eq, Ord, Show]
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
skip : Lexer -> Tokenizer ExtToken
|
|
|
|
|
skip t = match t $ const Skip
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
|
|
|
|
|
tmatch t f = match t (T . f)
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
|
2023-02-28 14:51:54 -05:00
|
|
|
|
export
|
2023-11-05 09:38:13 -05:00
|
|
|
|
fromStringLit : (String -> Token) -> String -> ExtToken
|
|
|
|
|
fromStringLit f str =
|
|
|
|
|
case go $ unpack $ drop 1 $ dropLast 1 str of
|
|
|
|
|
Left err => Invalid err str
|
|
|
|
|
Right ok => T $ f $ pack ok
|
|
|
|
|
where
|
|
|
|
|
Interpolation Char where interpolate = singleton
|
|
|
|
|
|
|
|
|
|
go, hexEscape : List Char -> Either String (List Char)
|
|
|
|
|
|
|
|
|
|
go [] = Right []
|
|
|
|
|
go ['\\'] = Left "string ends with \\"
|
|
|
|
|
go ('\\' :: 'n' :: cs) = ('\n' ::) <$> go cs
|
|
|
|
|
go ('\\' :: 't' :: cs) = ('\t' ::) <$> go cs
|
|
|
|
|
go ('\\' :: 'x' :: cs) = hexEscape cs
|
|
|
|
|
go ('\\' :: 'X' :: cs) = hexEscape cs
|
|
|
|
|
go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs
|
|
|
|
|
go ('\\' :: '"' :: cs) = ('"' ::) <$> go cs
|
2023-11-03 15:07:59 -04:00
|
|
|
|
-- [todo] others
|
2023-11-05 09:38:13 -05:00
|
|
|
|
go ('\\' :: c :: _) = Left "unknown escape '\{c}'"
|
|
|
|
|
go (c :: cs) = (c ::) <$> go cs
|
|
|
|
|
|
|
|
|
|
hexEscape cs =
|
|
|
|
|
case break (== ';') cs of
|
|
|
|
|
(hs, ';' :: rest) => do
|
|
|
|
|
let hs = pack hs
|
|
|
|
|
let Just c = Int.fromHex hs
|
|
|
|
|
| Nothing => Left #"invalid hex string "\#{hs}" in escape"#
|
|
|
|
|
if isCodepoint c
|
|
|
|
|
then (chr c ::) <$> go (assert_smaller cs rest)
|
|
|
|
|
else Left "codepoint \{hs} out of range"
|
|
|
|
|
_ => Left "unterminated hex escape"
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
string : Tokenizer ExtToken
|
2023-11-05 09:38:13 -05:00
|
|
|
|
string = match stringLit $ fromStringLit Str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%hide binLit
|
|
|
|
|
%hide octLit
|
|
|
|
|
%hide hexLit
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
nat : Tokenizer ExtToken
|
2023-11-05 09:38:13 -05:00
|
|
|
|
nat = match hexLit fromHexLit
|
|
|
|
|
<|> tmatch decLit fromDecLit
|
|
|
|
|
where
|
|
|
|
|
withUnderscores : Lexer -> Lexer
|
|
|
|
|
withUnderscores l = l <+> many (opt (is '_') <+> l)
|
|
|
|
|
|
|
|
|
|
withoutUnderscores : String -> String
|
|
|
|
|
withoutUnderscores = pack . go . unpack where
|
|
|
|
|
go : List Char -> List Char
|
|
|
|
|
go [] = []
|
|
|
|
|
go ('_' :: cs) = go cs
|
|
|
|
|
go (c :: cs) = c :: go cs
|
|
|
|
|
|
|
|
|
|
decLit =
|
|
|
|
|
withUnderscores (range '0' '9') <+> reject idContEnd
|
|
|
|
|
|
|
|
|
|
hexLit =
|
|
|
|
|
approx "0x" <+>
|
|
|
|
|
withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
|
|
|
|
|
reject idContEnd
|
|
|
|
|
|
|
|
|
|
fromDecLit : String -> Token
|
|
|
|
|
fromDecLit = Nat . cast . withoutUnderscores
|
|
|
|
|
|
|
|
|
|
fromHexLit : String -> ExtToken
|
|
|
|
|
fromHexLit str =
|
|
|
|
|
maybe (Invalid "invalid hex sequence" str) (T . Nat) $
|
|
|
|
|
fromHex $ withoutUnderscores $ drop 2 str
|
|
|
|
|
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
tag : Tokenizer ExtToken
|
|
|
|
|
tag = tmatch (is '\'' <+> name) (Tag . drop 1)
|
2023-11-05 09:38:13 -05:00
|
|
|
|
<|> match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1)
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
|
|
|
|
|
2023-05-21 14:09:34 -04:00
|
|
|
|
private %inline
|
|
|
|
|
fromSup : Char -> Char
|
|
|
|
|
fromSup c = case c of
|
|
|
|
|
'⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
|
|
|
|
|
'⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c
|
|
|
|
|
|
|
|
|
|
private %inline
|
|
|
|
|
supToNat : String -> Nat
|
|
|
|
|
supToNat = cast . pack . map fromSup . unpack
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-11-05 09:41:21 -05:00
|
|
|
|
-- ★0, Type0. base ★/Type is a Reserved and ★¹/Type¹ are sequences of two tokens
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
universe : Tokenizer ExtToken
|
2023-02-28 14:51:54 -05:00
|
|
|
|
universe = universeWith "★" <|> universeWith "Type" where
|
2023-09-24 11:36:20 -04:00
|
|
|
|
universeWith : String -> Tokenizer ExtToken
|
2023-02-28 14:51:54 -05:00
|
|
|
|
universeWith pfx =
|
|
|
|
|
let len = length pfx in
|
2023-09-24 11:36:20 -04:00
|
|
|
|
tmatch (exact pfx <+> digits) (TYPE . cast . drop len)
|
2023-05-21 14:09:34 -04:00
|
|
|
|
|
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
sup : Tokenizer ExtToken
|
|
|
|
|
sup = tmatch (some $ pred isSupDigit) (Sup . supToNat)
|
|
|
|
|
<|> tmatch (is '^' <+> digits) (Sup . cast . drop 1)
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private %inline
|
2023-02-28 14:51:54 -05:00
|
|
|
|
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
|
|
|
|
|
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
|
|
|
|
|
|
|
|
|
|
|
2023-03-08 10:46:29 -05:00
|
|
|
|
namespace Reserved
|
2023-03-04 15:02:51 -05:00
|
|
|
|
||| description of a reserved symbol
|
2023-03-08 10:46:29 -05:00
|
|
|
|
||| @ Word a reserved word (must not be followed by letters, digits, etc)
|
|
|
|
|
||| @ Sym a reserved symbol (must not be followed by symbolic chars)
|
2023-09-22 12:38:40 -04:00
|
|
|
|
||| @ Punc a character that doesn't show up in names (brackets, etc);
|
|
|
|
|
||| also a sequence ending in one of those, like `#[`, since the
|
|
|
|
|
||| difference relates to lookahead
|
2023-03-04 15:02:51 -05:00
|
|
|
|
public export
|
2023-09-22 12:38:40 -04:00
|
|
|
|
data Reserved1 = Word String | Sym String | Punc String
|
2023-03-08 10:46:29 -05:00
|
|
|
|
%runElab derive "Reserved1" [Eq, Ord, Show]
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
||| description of a token that might have unicode & ascii-only aliases
|
|
|
|
|
public export
|
2023-03-08 10:46:29 -05:00
|
|
|
|
data Reserved = Only Reserved1 | Or Reserved1 Reserved1
|
|
|
|
|
%runElab derive "Reserved" [Eq, Ord, Show]
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
public export
|
2023-09-22 12:38:40 -04:00
|
|
|
|
Sym1, Word1, Punc1 : String -> Reserved
|
|
|
|
|
Sym1 = Only . Sym
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Word1 = Only . Word
|
|
|
|
|
Punc1 = Only . Punc
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
public export
|
2023-03-08 10:46:29 -05:00
|
|
|
|
resString1 : Reserved1 -> String
|
2023-09-22 12:38:40 -04:00
|
|
|
|
resString1 (Punc x) = x
|
2023-03-08 10:46:29 -05:00
|
|
|
|
resString1 (Word w) = w
|
|
|
|
|
resString1 (Sym s) = s
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
||| return the representative string for a token description. if there are
|
|
|
|
|
||| two, then it's the first one, which should be the full-unicode one
|
|
|
|
|
public export
|
2023-03-08 10:46:29 -05:00
|
|
|
|
resString : Reserved -> String
|
2023-03-04 15:02:51 -05:00
|
|
|
|
resString (Only r) = resString1 r
|
|
|
|
|
resString (r `Or` _) = resString1 r
|
|
|
|
|
|
2023-09-24 11:36:20 -04:00
|
|
|
|
||| return both representative strings for a token description
|
|
|
|
|
public export
|
|
|
|
|
resString2 : Reserved -> List String
|
|
|
|
|
resString2 (Only r) = [resString1 r]
|
|
|
|
|
resString2 (r `Or` s) = [resString1 r, resString1 s]
|
|
|
|
|
|
2023-03-04 15:02:51 -05:00
|
|
|
|
private
|
2023-09-24 11:36:20 -04:00
|
|
|
|
resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
|
2023-03-04 15:02:51 -05:00
|
|
|
|
resTokenizer1 r str =
|
2023-03-08 10:46:29 -05:00
|
|
|
|
let res : String -> Token := const $ Reserved str in
|
2023-09-24 11:36:20 -04:00
|
|
|
|
case r of Word w => tmatch (exact w <+> reject idContEnd) res
|
|
|
|
|
Sym s => tmatch (exact s <+> reject symCont) res
|
|
|
|
|
Punc x => tmatch (exact x) res
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
|
|
|
|
||| match a reserved token
|
|
|
|
|
export
|
2023-09-24 11:36:20 -04:00
|
|
|
|
resTokenizer : Reserved -> Tokenizer ExtToken
|
2023-03-04 15:02:51 -05:00
|
|
|
|
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
|
|
|
|
|
resTokenizer (r `Or` s) =
|
|
|
|
|
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
|
|
|
|
|
|
|
|
|
|
||| reserved words & symbols.
|
2023-03-08 10:46:29 -05:00
|
|
|
|
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
|
|
|
|
|
||| e.g. `=>` in the input (if not part of a longer name)
|
|
|
|
|
||| will be returned as `Reserved "⇒"`.
|
2023-03-04 15:02:51 -05:00
|
|
|
|
public export
|
2023-03-08 10:46:29 -05:00
|
|
|
|
reserved : List Reserved
|
2023-03-04 15:02:51 -05:00
|
|
|
|
reserved =
|
2023-09-22 12:38:40 -04:00
|
|
|
|
[Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
|
|
|
|
|
Punc1 ",", Punc1 ";", Punc1 "#[",
|
|
|
|
|
Sym1 "@",
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Sym1 ":",
|
|
|
|
|
Sym "⇒" `Or` Sym "=>",
|
|
|
|
|
Sym "→" `Or` Sym "->",
|
|
|
|
|
Sym "×" `Or` Sym "**",
|
|
|
|
|
Sym "≡" `Or` Sym "==",
|
|
|
|
|
Sym "∷" `Or` Sym "::",
|
2023-09-22 12:38:40 -04:00
|
|
|
|
Punc1 ".",
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Word1 "case",
|
2023-04-02 09:22:39 -04:00
|
|
|
|
Word1 "case0", Word1 "case1",
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Word "caseω" `Or` Word "case#",
|
|
|
|
|
Word1 "return",
|
|
|
|
|
Word1 "of",
|
2023-09-18 15:52:51 -04:00
|
|
|
|
Word1 "fst", Word1 "snd",
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Word1 "_",
|
|
|
|
|
Word1 "Eq",
|
|
|
|
|
Word "λ" `Or` Word "fun",
|
|
|
|
|
Word "δ" `Or` Word "dfun",
|
|
|
|
|
Word "ω" `Or` Sym "#",
|
|
|
|
|
Sym "★" `Or` Word "Type",
|
2023-03-26 08:40:54 -04:00
|
|
|
|
Word "ℕ" `Or` Word "Nat",
|
2023-11-01 10:17:15 -04:00
|
|
|
|
Word1 "IOState",
|
|
|
|
|
Word1 "String",
|
2023-03-26 08:40:54 -04:00
|
|
|
|
Word1 "zero", Word1 "succ",
|
2023-04-15 09:13:01 -04:00
|
|
|
|
Word1 "coe", Word1 "comp",
|
2023-03-08 10:46:29 -05:00
|
|
|
|
Word1 "def",
|
|
|
|
|
Word1 "def0",
|
|
|
|
|
Word "defω" `Or` Word "def#",
|
2023-11-01 07:56:27 -04:00
|
|
|
|
Word1 "postulate",
|
|
|
|
|
Word1 "postulate0",
|
|
|
|
|
Word "postulateω" `Or` Word "postulate#",
|
2023-03-17 16:54:09 -04:00
|
|
|
|
Sym1 "=",
|
2023-03-12 13:28:37 -04:00
|
|
|
|
Word1 "load",
|
|
|
|
|
Word1 "namespace"]
|
2023-03-08 10:46:29 -05:00
|
|
|
|
|
2023-09-24 11:36:20 -04:00
|
|
|
|
public export
|
|
|
|
|
reservedStrings : List String
|
|
|
|
|
reservedStrings = map resString reserved
|
|
|
|
|
|
|
|
|
|
public export
|
|
|
|
|
allReservedStrings : List String
|
|
|
|
|
allReservedStrings = foldMap resString2 reserved
|
|
|
|
|
|
2023-03-08 10:46:29 -05:00
|
|
|
|
||| `IsReserved str` is true if `Reserved str` might actually show up in
|
2023-03-04 15:02:51 -05:00
|
|
|
|
||| the token stream
|
|
|
|
|
public export
|
|
|
|
|
IsReserved : String -> Type
|
2023-11-05 09:43:17 -05:00
|
|
|
|
IsReserved str = So (str `elem` reservedStrings)
|
2023-09-24 11:36:20 -04:00
|
|
|
|
|
|
|
|
|
private
|
|
|
|
|
name : Tokenizer ExtToken
|
|
|
|
|
name =
|
|
|
|
|
match name $ \str =>
|
|
|
|
|
let parts = split (== '.') $ normalizeNfc str in
|
|
|
|
|
case find (`elem` allReservedStrings) (toList parts) of
|
|
|
|
|
Nothing => T $ Name $ fromListP parts
|
|
|
|
|
Just w => Invalid "reserved word '\{w}' inside name \{str}" str
|
2023-03-04 15:02:51 -05:00
|
|
|
|
|
2023-02-28 14:51:54 -05:00
|
|
|
|
export
|
2023-09-24 11:36:20 -04:00
|
|
|
|
tokens : Tokenizer ExtToken
|
2023-02-28 14:51:54 -05:00
|
|
|
|
tokens = choice $
|
|
|
|
|
map skip [pred isWhitespace,
|
|
|
|
|
lineComment (exact "--" <+> reject symCont),
|
|
|
|
|
blockComment (exact "{-") (exact "-}")] <+>
|
2023-11-05 09:41:21 -05:00
|
|
|
|
[universe] <+> -- Type<i> takes precedence over bare Type
|
2023-03-04 15:02:51 -05:00
|
|
|
|
map resTokenizer reserved <+>
|
2023-05-21 14:09:34 -04:00
|
|
|
|
[sup, nat, string, tag, name]
|
2023-02-28 14:51:54 -05:00
|
|
|
|
|
2023-09-24 11:36:20 -04:00
|
|
|
|
export
|
|
|
|
|
check : Alternative f =>
|
|
|
|
|
WithBounds ExtToken -> Either Error (f (WithBounds Token))
|
|
|
|
|
check (MkBounded val irr bounds@(MkBounds line col _ _)) = case val of
|
|
|
|
|
Skip => Right empty
|
|
|
|
|
T tok => Right $ pure $ MkBounded tok irr bounds
|
|
|
|
|
Invalid msg tok => Left $ Err (Other msg) line col (index 0 tok)
|
|
|
|
|
|
|
|
|
|
export
|
|
|
|
|
toErrorReason : StopReason -> Maybe ErrorReason
|
|
|
|
|
toErrorReason EndInput = Nothing
|
|
|
|
|
toErrorReason NoRuleApply = Just NoRuleApply
|
|
|
|
|
toErrorReason (ComposeNotClosing s e) = Just $ ComposeNotClosing s e
|
|
|
|
|
|
2023-02-28 14:51:54 -05:00
|
|
|
|
export
|
|
|
|
|
lex : String -> Either Error (List (WithBounds Token))
|
|
|
|
|
lex str =
|
|
|
|
|
let (res, reason, line, col, str) = lex tokens str in
|
2023-09-24 11:36:20 -04:00
|
|
|
|
case toErrorReason reason of
|
|
|
|
|
Nothing => concatMap check res @{MonoidApplicative}
|
|
|
|
|
Just e => Left $ Err {reason = e, line, col, char = index 0 str}
|