module Quox.Lexer import Quox.CharExtra import Quox.Name import Data.String.Extra import Data.SortedMap import Text.Lexer import Text.Lexer.Tokenizer import Derive.Prelude %hide TT.Name %default total %language ElabReflection ||| @ R reserved token ||| @ I identifier ||| @ N nat literal ||| @ S string literal ||| @ T tag literal ||| @ TYPE "Type" or "★" with subscript public export data Token = R String | I Name | N Nat | S String | T String | TYPE Nat %runElab derive "Token" [Eq, Ord, Show] -- token or whitespace private 0 TokenW : Type TokenW = Maybe Token public export record Error where constructor Err reason : StopReason line, col : Int ||| `Nothing` if the error is at the end of the input char : Maybe Char %runElab derive "StopReason" [Eq, Ord, Show] %runElab derive "Error" [Eq, Ord, Show] skip : Lexer -> Tokenizer TokenW skip t = match t $ const Nothing export syntaxChars : List Char syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.'] export isSymStart, isSymCont : Char -> Bool isSymStart c = not (c `elem` syntaxChars) && isSymChar c isSymCont c = c == '\'' || isSymStart c export idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer idStart = pred isIdStart idCont = pred isIdCont idEnd = pred $ \c => c `elem` unpack "?!#" idContEnd = idCont <|> idEnd symStart = pred isSymStart symCont = pred isSymCont private %inline resVal : String -> a -> Maybe Token resVal str = const $ Just $ R str export resWordL, resSymL, syntaxL : String -> Lexer resWordL str = exact str <+> reject idContEnd resSymL str = exact str <+> reject symCont syntaxL str = exact str export resWord, resSym, syntax : String -> Tokenizer TokenW resWord str = match (resWordL str) (resVal str) resSym str = match (resSymL str) (resVal str) syntax str = match (syntaxL str) (resVal str) -- return value contains unicode version export uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW uresWord u a = match (resWordL u <|> resWordL a) (resVal u) uresSym u a = match (resSymL u <|> resSymL a) (resVal u) usyntax u a = match (exact u <|> exact a) (resVal u) export alphaName, symName, baseNameL : Lexer alphaName = idStart <+> many idCont <+> many idEnd symName = symStart <+> many symCont baseNameL = symName <|> alphaName export baseName : Tokenizer BaseName baseName = match baseNameL UN private toName : String -> Name toName = fromList . split (== '.') export name : Tokenizer TokenW name = match (baseNameL <+> many (is '.' <+> baseNameL)) (Just . I . toName . normalizeNfc) -- [todo] escapes other than \" and (accidentally) \\ export fromStringLit : String -> String fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where go : List Char -> List Char go [] = [] go ['\\'] = ['\\'] -- i guess??? go ('\\' :: c :: cs) = c :: go cs go (c :: cs) = c :: go cs export string : Tokenizer TokenW string = match stringLit (Just . S . fromStringLit) export nat : Tokenizer TokenW nat = match (some (range '0' '9')) (Just . N . cast) export tag : Tokenizer TokenW tag = match (is '`' <+> baseNameL) (Just . T . drop 1) <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1) private %inline fromSub : Char -> Char fromSub c = case c of '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4' '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c private %inline subToNat : String -> Nat subToNat = cast . pack . map fromSub . unpack export universe : Tokenizer TokenW universe = universeWith "★" <|> universeWith "Type" where universeWith : String -> Tokenizer TokenW universeWith pfx = let len = length pfx in match (exact pfx <+> some (range '0' '9')) (Just . TYPE . cast . drop len) <|> match (exact pfx <+> some (range '₀' '₉')) (Just . TYPE . subToNat . drop len) export choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a choice (t :: ts) = foldl (\a, b => a <|> b) t ts export tokens : Tokenizer TokenW tokens = choice $ map skip [pred isWhitespace, lineComment (exact "--" <+> reject symCont), blockComment (exact "{-") (exact "-}")] <+> map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+> [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+> map resSym ["@", ":"] <+> map (uncurry uresSym) [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+> map resWord ["case", "case1", "caseω", "return", "of", "_"] <+> map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+> [resWord "ω", match (resSymL "#") (resVal "ω"), universe, resSym "★", match (resWordL "Type") (resVal "★")] <+> [nat, string, tag, name] export lex : String -> Either Error (List (WithBounds Token)) lex str = let (res, reason, line, col, str) = lex tokens str in case reason of EndInput => Right $ mapMaybe sequence res _ => Left $ Err {reason, line, col, char = index 0 str}