242 lines
6.2 KiB
Idris
242 lines
6.2 KiB
Idris
module Quox.Lexer
|
||
|
||
import Quox.CharExtra
|
||
import Quox.Name
|
||
import Data.String.Extra
|
||
import Data.SortedMap
|
||
import public Data.List.Elem
|
||
import public Text.Lexer
|
||
import public Text.Lexer.Tokenizer
|
||
import Derive.Prelude
|
||
%hide TT.Name
|
||
|
||
|
||
%default total
|
||
%language ElabReflection
|
||
|
||
|
||
||| @ R reserved token
|
||
||| @ I identifier
|
||
||| @ N nat literal
|
||
||| @ S string literal
|
||
||| @ T tag literal
|
||
||| @ TYPE "Type" or "★" with subscript
|
||
public export
|
||
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
|
||
%runElab derive "Token" [Eq, Ord, Show]
|
||
|
||
-- token or whitespace
|
||
public export
|
||
0 TokenW : Type
|
||
TokenW = Maybe Token
|
||
|
||
|
||
public export
|
||
record Error where
|
||
constructor Err
|
||
reason : StopReason
|
||
line, col : Int
|
||
||| `Nothing` if the error is at the end of the input
|
||
char : Maybe Char
|
||
%runElab derive "StopReason" [Eq, Ord, Show]
|
||
%runElab derive "Error" [Eq, Ord, Show]
|
||
|
||
|
||
private
|
||
skip : Lexer -> Tokenizer TokenW
|
||
skip t = match t $ const Nothing
|
||
|
||
private
|
||
match : Lexer -> (String -> Token) -> Tokenizer TokenW
|
||
match t f = Tokenizer.match t (Just . f)
|
||
%hide Tokenizer.match
|
||
|
||
|
||
export %inline
|
||
syntaxChars : List Char
|
||
syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';']
|
||
|
||
private
|
||
stra, isSymCont : Char -> Bool
|
||
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
|
||
isSymCont c = c == '\'' || isSymStart c
|
||
|
||
private
|
||
idStart, idCont, idEnd, idContEnd : Lexer
|
||
idStart = pred isIdStart
|
||
idCont = pred isIdCont
|
||
idEnd = pred $ \c => c `elem` unpack "?!#"
|
||
idContEnd = idCont <|> idEnd
|
||
|
||
private
|
||
symStart, symCont : Lexer
|
||
symStart = pred isSymStart
|
||
symCont = pred isSymCont
|
||
|
||
private
|
||
baseNameL : Lexer
|
||
baseNameL = idStart <+> many idCont <+> many idEnd
|
||
<|> symStart <+> many symCont
|
||
|
||
private
|
||
nameL : Lexer
|
||
nameL = baseNameL <+> many (is '.' <+> baseNameL)
|
||
|
||
private
|
||
name : Tokenizer TokenW
|
||
name = match nameL $ I . fromList . split (== '.') . normalizeNfc
|
||
|
||
||| [todo] escapes other than `\"` and (accidentally) `\\`
|
||
export
|
||
fromStringLit : String -> String
|
||
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
|
||
go : List Char -> List Char
|
||
go [] = []
|
||
go ['\\'] = ['\\'] -- i guess???
|
||
go ('\\' :: c :: cs) = c :: go cs
|
||
go (c :: cs) = c :: go cs
|
||
|
||
private
|
||
string : Tokenizer TokenW
|
||
string = match stringLit (S . fromStringLit)
|
||
|
||
private
|
||
nat : Tokenizer TokenW
|
||
nat = match (some (range '0' '9')) (N . cast)
|
||
|
||
private
|
||
tag : Tokenizer TokenW
|
||
tag = match (is '\'' <+> nameL) (T . drop 1)
|
||
<|> match (is '\'' <+> stringLit) (T . fromStringLit . drop 1)
|
||
|
||
|
||
|
||
private %inline
|
||
fromSub : Char -> Char
|
||
fromSub c = case c of
|
||
'₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
|
||
'₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c
|
||
|
||
private %inline
|
||
subToNat : String -> Nat
|
||
subToNat = cast . pack . map fromSub . unpack
|
||
|
||
|
||
private
|
||
universe : Tokenizer TokenW
|
||
universe = universeWith "★" <|> universeWith "Type" where
|
||
universeWith : String -> Tokenizer TokenW
|
||
universeWith pfx =
|
||
let len = length pfx in
|
||
match (exact pfx <+> some (range '0' '9'))
|
||
(TYPE . cast . drop len) <|>
|
||
match (exact pfx <+> some (range '₀' '₉'))
|
||
(TYPE . subToNat . drop len)
|
||
|
||
|
||
private %inline
|
||
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
|
||
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
|
||
|
||
|
||
namespace Res
|
||
||| description of a reserved symbol
|
||
||| @ W a reserved word (must not be followed by letters, digits, etc)
|
||
||| @ S a reserved symbol (must not be followed by symbolic chars)
|
||
||| @ X a character that doesn't show up in names (brackets, etc)
|
||
public export
|
||
data Res1 = W String | S String | X Char
|
||
%runElab derive "Res1" [Eq, Ord, Show]
|
||
|
||
||| description of a token that might have unicode & ascii-only aliases
|
||
public export
|
||
data Res = Only Res1 | Or Res1 Res1
|
||
%runElab derive "Res" [Eq, Ord, Show]
|
||
|
||
public export
|
||
S1, W1 : String -> Res
|
||
S1 = Only . S
|
||
W1 = Only . W
|
||
|
||
public export
|
||
X1 : Char -> Res
|
||
X1 = Only . X
|
||
|
||
public export
|
||
resString1 : Res1 -> String
|
||
resString1 (X x) = singleton x
|
||
resString1 (W w) = w
|
||
resString1 (S s) = s
|
||
|
||
||| return the representative string for a token description. if there are
|
||
||| two, then it's the first one, which should be the full-unicode one
|
||
public export
|
||
resString : Res -> String
|
||
resString (Only r) = resString1 r
|
||
resString (r `Or` _) = resString1 r
|
||
|
||
private
|
||
resTokenizer1 : Res1 -> String -> Tokenizer TokenW
|
||
resTokenizer1 r str =
|
||
let res : String -> Token := const $ R str in
|
||
case r of W w => match (exact w <+> reject idContEnd) res
|
||
S s => match (exact s <+> reject symCont) res
|
||
X x => match (is x) res
|
||
|
||
||| match a reserved token
|
||
export
|
||
resTokenizer : Res -> Tokenizer TokenW
|
||
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
|
||
resTokenizer (r `Or` s) =
|
||
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
|
||
|
||
||| reserved words & symbols.
|
||
||| the tokens recognised by ``a `Or` b`` will be `R a`. e.g. `=>` in the input
|
||
||| (if not part of a longer name) will be returned as `R "⇒"`.
|
||
public export
|
||
reserved : List Res
|
||
reserved =
|
||
[X1 '(', X1 ')', X1 '[', X1 ']', X1 '{', X1 '}', X1 ',', X1 ';',
|
||
S1 "@",
|
||
S1 ":",
|
||
S "⇒" `Or` S "=>",
|
||
S "→" `Or` S "->",
|
||
S "×" `Or` S "**",
|
||
S "≡" `Or` S "==",
|
||
S "∷" `Or` S "::",
|
||
S "·" `Or` X '.',
|
||
W1 "case",
|
||
W1 "case1",
|
||
W "caseω" `Or` W "case#",
|
||
W1 "return",
|
||
W1 "of",
|
||
W1 "_",
|
||
W1 "Eq",
|
||
W "λ" `Or` W "fun",
|
||
W "δ" `Or` W "dfun",
|
||
W "ω" `Or` S "#",
|
||
S "★" `Or` W "Type"]
|
||
|
||
||| `IsReserved str` is true if `R str` might actually show up in
|
||
||| the token stream
|
||
public export
|
||
IsReserved : String -> Type
|
||
IsReserved str = str `Elem` map resString reserved
|
||
|
||
export
|
||
tokens : Tokenizer TokenW
|
||
tokens = choice $
|
||
map skip [pred isWhitespace,
|
||
lineComment (exact "--" <+> reject symCont),
|
||
blockComment (exact "{-") (exact "-}")] <+>
|
||
[universe] <+> -- ★ᵢ takes precedence over bare ★
|
||
map resTokenizer reserved <+>
|
||
[nat, string, tag, name]
|
||
|
||
export
|
||
lex : String -> Either Error (List (WithBounds Token))
|
||
lex str =
|
||
let (res, reason, line, col, str) = lex tokens str in
|
||
case reason of
|
||
EndInput => Right $ mapMaybe sequence res
|
||
_ => Left $ Err {reason, line, col, char = index 0 str}
|