module Quox.Parser.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude

%hide TT.Name

%default total
%language ElabReflection


||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ Str string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token =
    Reserved String
  | Name PName
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]
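
-- Illustrative forms (added note, not in the original source): the rules
-- below map `42` to `Nat 42`, `"hi"` to `Str "hi"`, `'foo` to `Tag "foo"`,
-- and `★₃` to `TYPE 3`.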

-- token or whitespace
public export
0 TokenW : Type
TokenW = Maybe Token


public export
record Error where
  constructor Err
  reason    : StopReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]


private
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing

private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match
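
-- Note (added for clarity; the intent is inferred from the code): the local
-- `match` wraps `Tokenizer.match` so that every rule produces a `TokenW`
-- (`Maybe Token`); the `%hide` makes later unqualified uses of `match`
-- refer to this wrapper rather than the library function.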


export %inline
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';']

private
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c

private
idStart, idCont, idEnd, idContEnd : Lexer
idStart   = pred isIdStart
idCont    = pred isIdCont
idEnd     = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd

private
symStart, symCont : Lexer
symStart = pred isSymStart
symCont  = pred isSymCont

private
baseNameL : Lexer
baseNameL = idStart <+> many idCont <+> many idEnd
        <|> symStart <+> many symCont

private
nameL : Lexer
nameL = baseNameL <+> many (is '.' <+> baseNameL)

private
name : Tokenizer TokenW
name = match nameL $ Name . fromListP . split (== '.') . normalizeNfc
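
-- Illustrative example (not in the original source): an input like `a.b.c`
-- is NFC-normalised, split on '.', and packed into a qualified `Name`.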


||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go [] = []
  go ['\\'] = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs) = c :: go cs
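
-- Illustrative behaviour (not in the original source): for the raw literal
-- text "a\"b" (including its quotes), fromStringLit drops the surrounding
-- quotes and replaces each backslash escape by the following character,
-- yielding a"b.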

private
string : Tokenizer TokenW
string = match stringLit (Str . fromStringLit)

private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Nat . cast)

private
tag : Tokenizer TokenW
tag = match (is '\'' <+> nameL) (Tag . drop 1)
  <|> match (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)
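
-- Illustrative examples (not in the original source): `'foo` lexes to
-- `Tag "foo"`; the quoted form `'"weird tag"` allows characters that can't
-- appear in names and lexes to `Tag "weird tag"`.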


private %inline
fromSub : Char -> Char
fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c

private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
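
-- Illustrative example (not in the original source): subToNat "₁₂" rewrites
-- the subscript digits to "12" and casts, giving 12.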


private
universe : Tokenizer TokenW
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    let len = length pfx in
    match (exact pfx <+> some (range '0' '9'))
          (TYPE . cast . drop len) <|>
    match (exact pfx <+> some (range '₀' '₉'))
          (TYPE . subToNat . drop len)
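
-- Illustrative examples (not in the original source): "★₃", "★3", "Type₃"
-- and "Type3" all lex to `TYPE 3`; a bare "★" or "Type" is left to the
-- reserved-token rules below.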


private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts


namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc)
  public export
  data Reserved1 = Word String | Sym String | Punc Char
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]


public export
Sym1, Word1 : String -> Reserved
Sym1  = Only . Sym
Word1 = Only . Word

public export
Punc1 : Char -> Reserved
Punc1 = Only . Punc

public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym s)  = s

||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r)   = resString1 r
resString (r `Or` _) = resString1 r
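
-- Illustrative example (not in the original source):
-- resString (Sym "⇒" `Or` Sym "=>") is "⇒", the full-unicode spelling.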

private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
resTokenizer1 r str =
  let res : String -> Token := const $ Reserved str in
  case r of Word w => match (exact w <+> reject idContEnd) res
            Sym s  => match (exact s <+> reject symCont) res
            Punc x => match (is x) res

||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer TokenW
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)


||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
  [Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
   Punc1 ',', Punc1 ';',
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Sym "·" `Or` Punc '.',
   Word1 "case",
   Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   Word "ω" `Or` Sym "#",
   Sym "★" `Or` Word "Type",
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Sym "≔" `Or` Sym ":=",
   Word1 "load",
   Word1 "namespace"]


||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved
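
-- Illustrative use (not in the original source): requiring a proof of
-- `IsReserved "⇒"` ensures at compile time that "⇒" really is one of the
-- strings produced by `resString reserved`, so misspelled reserved tokens
-- are caught early.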


export
tokens : Tokenizer TokenW
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+> -- ★ᵢ takes precedence over bare ★
  map resTokenizer reserved <+>
  [nat, string, tag, name]


export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case reason of
    EndInput => Right $ mapMaybe sequence res
    _        => Left $ Err {reason, line, col, char = index 0 str}
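
-- Illustrative use (not in the original source): lexing "λ x ⇒ x" yields
-- `Right` a list of bounded tokens, roughly `Reserved "λ"`, a `Name` for x,
-- `Reserved "⇒"`, and another `Name` for x; whitespace and comments are
-- dropped by `mapMaybe sequence`.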