module Quox.Lexer import Quox.CharExtra import Quox.Name import Data.String.Extra import Data.SortedMap import public Data.List.Elem import public Text.Lexer import public Text.Lexer.Tokenizer import Derive.Prelude %hide TT.Name %default total %language ElabReflection ||| @ R reserved token ||| @ I identifier ||| @ N nat literal ||| @ S string literal ||| @ T tag literal ||| @ TYPE "Type" or "★" with subscript public export data Token = R String | I Name | N Nat | S String | T String | TYPE Nat %runElab derive "Token" [Eq, Ord, Show] -- token or whitespace public export 0 TokenW : Type TokenW = Maybe Token public export record Error where constructor Err reason : StopReason line, col : Int ||| `Nothing` if the error is at the end of the input char : Maybe Char %runElab derive "StopReason" [Eq, Ord, Show] %runElab derive "Error" [Eq, Ord, Show] private skip : Lexer -> Tokenizer TokenW skip t = match t $ const Nothing private match : Lexer -> (String -> Token) -> Tokenizer TokenW match t f = Tokenizer.match t (Just . f) %hide Tokenizer.match export %inline syntaxChars : List Char syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';'] private stra, isSymCont : Char -> Bool isSymStart c = not (c `elem` syntaxChars) && isSymChar c isSymCont c = c == '\'' || isSymStart c private idStart, idCont, idEnd, idContEnd : Lexer idStart = pred isIdStart idCont = pred isIdCont idEnd = pred $ \c => c `elem` unpack "?!#" idContEnd = idCont <|> idEnd private symStart, symCont : Lexer symStart = pred isSymStart symCont = pred isSymCont private baseNameL : Lexer baseNameL = idStart <+> many idCont <+> many idEnd <|> symStart <+> many symCont private nameL : Lexer nameL = baseNameL <+> many (is '.' <+> baseNameL) private name : Tokenizer TokenW name = match nameL $ I . fromList . split (== '.') . normalizeNfc ||| [todo] escapes other than `\"` and (accidentally) `\\` export fromStringLit : String -> String fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where go : List Char -> List Char go [] = [] go ['\\'] = ['\\'] -- i guess??? go ('\\' :: c :: cs) = c :: go cs go (c :: cs) = c :: go cs private string : Tokenizer TokenW string = match stringLit (S . fromStringLit) private nat : Tokenizer TokenW nat = match (some (range '0' '9')) (N . cast) private tag : Tokenizer TokenW tag = match (is '\'' <+> nameL) (T . drop 1) <|> match (is '\'' <+> stringLit) (T . fromStringLit . drop 1) private %inline fromSub : Char -> Char fromSub c = case c of '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4' '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c private %inline subToNat : String -> Nat subToNat = cast . pack . map fromSub . unpack private universe : Tokenizer TokenW universe = universeWith "★" <|> universeWith "Type" where universeWith : String -> Tokenizer TokenW universeWith pfx = let len = length pfx in match (exact pfx <+> some (range '0' '9')) (TYPE . cast . drop len) <|> match (exact pfx <+> some (range '₀' '₉')) (TYPE . subToNat . drop len) private %inline choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a choice (t :: ts) = foldl (\a, b => a <|> b) t ts namespace Res ||| description of a reserved symbol ||| @ W a reserved word (must not be followed by letters, digits, etc) ||| @ S a reserved symbol (must not be followed by symbolic chars) ||| @ X a character that doesn't show up in names (brackets, etc) public export data Res1 = W String | S String | X Char %runElab derive "Res1" [Eq, Ord, Show] ||| description of a token that might have unicode & ascii-only aliases public export data Res = Only Res1 | Or Res1 Res1 %runElab derive "Res" [Eq, Ord, Show] public export S1, W1 : String -> Res S1 = Only . S W1 = Only . W public export X1 : Char -> Res X1 = Only . X public export resString1 : Res1 -> String resString1 (X x) = singleton x resString1 (W w) = w resString1 (S s) = s ||| return the representative string for a token description. if there are ||| two, then it's the first one, which should be the full-unicode one public export resString : Res -> String resString (Only r) = resString1 r resString (r `Or` _) = resString1 r private resTokenizer1 : Res1 -> String -> Tokenizer TokenW resTokenizer1 r str = let res : String -> Token := const $ R str in case r of W w => match (exact w <+> reject idContEnd) res S s => match (exact s <+> reject symCont) res X x => match (is x) res ||| match a reserved token export resTokenizer : Res -> Tokenizer TokenW resTokenizer (Only r) = resTokenizer1 r (resString1 r) resTokenizer (r `Or` s) = resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r) ||| reserved words & symbols. ||| the tokens recognised by ``a `Or` b`` will be `R a`. e.g. `=>` in the input ||| (if not part of a longer name) will be returned as `R "⇒"`. public export reserved : List Res reserved = [X1 '(', X1 ')', X1 '[', X1 ']', X1 '{', X1 '}', X1 ',', X1 ';', S1 "@", S1 ":", S "⇒" `Or` S "=>", S "→" `Or` S "->", S "×" `Or` S "**", S "≡" `Or` S "==", S "∷" `Or` S "::", S "·" `Or` X '.', W1 "case", W1 "case1", W "caseω" `Or` W "case#", W1 "return", W1 "of", W1 "_", W1 "Eq", W "λ" `Or` W "fun", W "δ" `Or` W "dfun", W "ω" `Or` S "#", S "★" `Or` W "Type"] ||| `IsReserved str` is true if `R str` might actually show up in ||| the token stream public export IsReserved : String -> Type IsReserved str = str `Elem` map resString reserved export tokens : Tokenizer TokenW tokens = choice $ map skip [pred isWhitespace, lineComment (exact "--" <+> reject symCont), blockComment (exact "{-") (exact "-}")] <+> [universe] <+> -- ★ᵢ takes precedence over bare ★ map resTokenizer reserved <+> [nat, string, tag, name] export lex : String -> Either Error (List (WithBounds Token)) lex str = let (res, reason, line, col, str) = lex tokens str in case reason of EndInput => Right $ mapMaybe sequence res _ => Left $ Err {reason, line, col, char = index 0 str}