quox/lib/Quox/Lexer.idr

183 lines
5 KiB
Idris
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module Quox.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import Text.Lexer
import Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name
%default total
%language ElabReflection
||| @ R reserved token
||| @ I identifier
||| @ N nat literal
||| @ S string literal
||| @ T tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]
-- token or whitespace
private
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
export
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']
export
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c
export
idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
symStart = pred isSymStart
symCont = pred isSymCont
private %inline
resVal : String -> a -> Maybe Token
resVal str = const $ Just $ R str
export
resWordL, resSymL, syntaxL : String -> Lexer
resWordL str = exact str <+> reject idContEnd
resSymL str = exact str <+> reject symCont
syntaxL str = exact str
export
resWord, resSym, syntax : String -> Tokenizer TokenW
resWord str = match (resWordL str) (resVal str)
resSym str = match (resSymL str) (resVal str)
syntax str = match (syntaxL str) (resVal str)
-- return value contains unicode version
export
uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
uresSym u a = match (resSymL u <|> resSymL a) (resVal u)
usyntax u a = match (exact u <|> exact a) (resVal u)
export
alphaName, symName, baseNameL : Lexer
alphaName = idStart <+> many idCont <+> many idEnd
symName = symStart <+> many symCont
baseNameL = symName <|> alphaName
export
baseName : Tokenizer BaseName
baseName = match baseNameL UN
private
toName : String -> Name
toName = fromList . split (== '.')
export
name : Tokenizer TokenW
name = match (baseNameL <+> many (is '.' <+> baseNameL))
(Just . I . toName . normalizeNfc)
-- [todo] escapes other than \" and (accidentally) \\
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
export
string : Tokenizer TokenW
string = match stringLit (Just . S . fromStringLit)
export
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Just . N . cast)
export
tag : Tokenizer TokenW
tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
<|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
export
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> some (range '0' '9'))
(Just . TYPE . cast . drop len) <|>
match (exact pfx <+> some (range '' ''))
(Just . TYPE . subToNat . drop len)
export
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
[match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
map resSym ["@", ":"] <+>
map (uncurry uresSym)
[("·", "%"), ("","->"), ("×", "**"), ("", "=="), ("", "::")] <+>
map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
[resWord "ω", match (resSymL "#") (resVal "ω"),
universe, resSym "", match (resWordL "Type") (resVal "")] <+>
[nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}