module Quox.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import Text.Lexer
import Text.Lexer.Tokenizer
import Generics.Derive
%hide TT.Name


%default total
%language ElabReflection


||| @ R reserved token
||| @ I identifier
||| @ N nat literal
||| @ S string literal
||| @ T tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
%runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show]
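-- Illustrative examples, per the tokenizers below: "case" lexes to R "case",
-- "foo.bar" to I (a dotted name), "42" to N 42, a string literal to S with
-- its contents, `foo to T "foo", and "Type3" or "★₃" to TYPE 3.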
-- token or whitespace
private
0 TokenW : Type
TokenW = Maybe Token


public export
record Error where
  constructor Err
  reason : StopReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show]
%runElab derive "Error" [Generic, Meta, Eq, Ord, DecEq, Show]


skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
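-- e.g. `skip (pred isWhitespace)` (used in `tokens` below) matches whitespace
-- but yields Nothing, so `lex` can drop it from the final token list.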
export
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']

export
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c

export
idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
symStart = pred isSymStart
symCont = pred isSymCont
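-- Intent, as far as these classes suggest: alphabetic names may end in a run
-- of '?', '!' or '#', while symbolic names are built from isSymChar
-- characters that are not in syntaxChars.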
private %inline
resVal : String -> a -> Maybe Token
resVal str = const $ Just $ R str

export
resWordL, resSymL, syntaxL : String -> Lexer
resWordL str = exact str <+> reject idContEnd
resSymL str = exact str <+> reject symCont
syntaxL str = exact str
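-- The `reject` suffix keeps reserved tokens from matching mere prefixes:
-- resWordL "case" does not fire inside "cases", and resSymL ":" does not
-- fire inside "::".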
export
resWord, resSym, syntax : String -> Tokenizer TokenW
resWord str = match (resWordL str) (resVal str)
resSym str = match (resSymL str) (resVal str)
syntax str = match (syntaxL str) (resVal str)

-- return value contains unicode version
export
uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
uresSym u a = match (resSymL u <|> resSymL a) (resVal u)
usyntax u a = match (exact u <|> exact a) (resVal u)
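-- Both spellings produce the same token, carrying the unicode form:
-- e.g. uresSym "→" "->" turns either "→" or "->" into R "→".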
export
alphaName, symName, baseNameL : Lexer
alphaName = idStart <+> many idCont <+> many idEnd
symName = symStart <+> many symCont
baseNameL = symName <|> alphaName

export
baseName : Tokenizer BaseName
baseName = match baseNameL UN

private
toName : String -> Name
toName = fromList . split (== '.')
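-- e.g. toName "a.b.c" splits on the dots into the parts "a", "b", "c"
-- (rebuilt into a Name by fromList, presumably from Quox.Name).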
export
name : Tokenizer TokenW
name = match (baseNameL <+> many (is '.' <+> baseNameL))
             (Just . I . toName . normalizeNfc)
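-- normalizeNfc presumably applies NFC unicode normalization, so visually
-- identical spellings of a name should produce the same Name.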
-- [todo] escapes other than \" and (accidentally) \\
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go [] = []
  go ['\\'] = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs) = c :: go cs
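-- e.g. the literal "ab\"cd" becomes the string ab"cd; any other escape just
-- drops the backslash, so "\n" currently yields the letter n (see the todo
-- above).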
export
string : Tokenizer TokenW
string = match stringLit (Just . S . fromStringLit)

export
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Just . N . cast)

export
tag : Tokenizer TokenW
tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
  <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
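-- e.g. `foo lexes to T "foo", and `"two words" lexes to T "two words".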
private %inline
fromSub : Char -> Char
fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c

private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
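-- e.g. subToNat "₄₂" == 42; fromSub leaves other characters alone, so plain
-- digits would also pass through unchanged.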
export
universe : Tokenizer TokenW
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    let len = length pfx in
    match (exact pfx <+> some (range '0' '9'))
          (Just . TYPE . cast . drop len) <|>
    match (exact pfx <+> some (range '₀' '₉'))
          (Just . TYPE . subToNat . drop len)
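-- e.g. "Type0", "Type₀", "★0" and "★₀" should all lex to TYPE 0.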
export
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
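-- The NonEmpty proof rules out `choice []` at compile time; alternatives are
-- tried left to right.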
export
tokens : Tokenizer TokenW
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
  [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
  map resSym ["@", ":"] <+>
  map (uncurry uresSym)
      [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+>
  map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
  map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
  [resWord "ω", match (resSymL "#") (resVal "ω"),
   universe, resSym "★", match (resWordL "Type") (resVal "★")] <+>
  [nat, string, tag, name]
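-- Ordering matters: reserved words, symbols and `universe` come before
-- `name`, so e.g. "case" becomes R "case" rather than an identifier.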
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case reason of
    EndInput => Right $ mapMaybe sequence res
    _ => Left $ Err {reason, line, col, char = index 0 str}
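-- On success, whitespace and comment entries (the Nothings) are removed by
-- `mapMaybe sequence`; on failure, the error records the position and the
-- offending character (Nothing at end of input). For example, lex "★₀"
-- should yield a single TYPE 0 token with its bounds.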