module Quox.Parser.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude

%hide TT.Name

%default total
%language ElabReflection


||| @ Reserved reserved token
||| @ Name     name, possibly qualified
||| @ Nat      nat literal
||| @ Str      string literal
||| @ Tag      tag literal
||| @ TYPE     "Type" or "★" with ascii nat directly after
||| @ Sup      superscript or ^ number (displacement, or universe for ★)
public export
data Token =
    Reserved String
  | Name PName
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
  | Sup Nat
%runElab derive "Token" [Eq, Ord, Show]

||| token or whitespace
||| @ Skip    whitespace, comments, etc
||| @ Invalid a token which failed a post-lexer check
|||           (e.g. a qualified name containing a keyword)
||| @ T       a well formed token
public export
data ExtToken = Skip | Invalid String String | T Token
%runElab derive "ExtToken" [Eq, Ord, Show]


public export
data ErrorReason =
    NoRuleApply
  | ComposeNotClosing (Int, Int) (Int, Int)
  | Other String
%runElab derive "ErrorReason" [Eq, Ord, Show]

public export
record Error where
  constructor Err
  reason : ErrorReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]


private
skip : Lexer -> Tokenizer ExtToken
skip t = match t $ const Skip

private
tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
tmatch t f = match t (T . f)


||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go [] = []
  go ['\\'] = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs)         = c :: go cs

private
string : Tokenizer ExtToken
string = tmatch stringLit (Str . fromStringLit)

private
nat : Tokenizer ExtToken
nat = tmatch (some (range '0' '9')) (Nat . cast)

private
tag : Tokenizer ExtToken
tag = tmatch (is '\'' <+> name)      (Tag . drop 1)
  <|> tmatch (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)


private %inline
fromSub : Char -> Char
fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c

private %inline
fromSup : Char -> Char
fromSup c = case c of
  '⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
  '⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c

private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack

private %inline
supToNat : String -> Nat
supToNat = cast . pack . map fromSup . unpack


-- ★0, Type0. base ★/Type is a Reserved
private
universe : Tokenizer ExtToken
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer ExtToken
  universeWith pfx =
    let len = length pfx in
    tmatch (exact pfx <+> digits) (TYPE . cast . drop len)

private
sup : Tokenizer ExtToken
sup = tmatch (some $ pred isSupDigit) (Sup . supToNat)
  <|> tmatch (is '^' <+> digits)      (Sup . cast . drop 1)
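-- A few illustrative reductions of the helpers above (not part of the
-- module; just a sketch of the intended behaviour):
--   fromStringLit "\"a\\\"b\""  ==  "a\"b"   -- surrounding quotes dropped, \" unescaped
--   supToNat "¹²"               ==  12       -- superscript digits mapped to ascii
--   subToNat "₃₄"               ==  34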
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts

namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym  a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc);
  |||        also a sequence ending in one of those, like `#[`, since the
  |||        difference relates to lookahead
  public export
  data Reserved1 = Word String | Sym String | Punc String
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]

public export
Sym1, Word1, Punc1 : String -> Reserved
Sym1  = Only . Sym
Word1 = Only . Word
Punc1 = Only . Punc

public export
resString1 : Reserved1 -> String
resString1 (Punc x) = x
resString1 (Word w) = w
resString1 (Sym s)  = s

||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r)   = resString1 r
resString (r `Or` _) = resString1 r

||| return both representative strings for a token description
public export
resString2 : Reserved -> List String
resString2 (Only r)   = [resString1 r]
resString2 (r `Or` s) = [resString1 r, resString1 s]

private
resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
resTokenizer1 r str =
  let res : String -> Token := const $ Reserved str in
  case r of
    Word w => tmatch (exact w <+> reject idContEnd) res
    Sym s  => tmatch (exact s <+> reject symCont) res
    Punc x => tmatch (exact x) res

||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer ExtToken
resTokenizer (Only r)   = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
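-- Illustrative examples (not part of the module):
--   resString  (Sym "→" `Or` Sym "->")  ==  "→"
--   resString2 (Sym "→" `Or` Sym "->")  ==  ["→", "->"]
-- so ``resTokenizer (Sym "→" `Or` Sym "->")`` accepts both "→" and "->"
-- in the input, producing the single token `Reserved "→"` for either.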
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
  [Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
   Punc1 ",", Punc1 ";", Punc1 "#[",
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Punc1 ".",
   Word1 "case",
   Word1 "case0", Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "fst", Word1 "snd",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   Word "ω" `Or` Sym "#",
   Sym "★" `Or` Word "Type",
   Word "ℕ" `Or` Word "Nat",
   Word1 "zero", Word1 "succ",
   Word1 "coe", Word1 "comp",
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Sym1 "=",
   Word1 "load",
   Word1 "namespace"]

public export
reservedStrings : List String
reservedStrings = map resString reserved

public export
allReservedStrings : List String
allReservedStrings = foldMap resString2 reserved

||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = So (str `elem` allReservedStrings)

private
name : Tokenizer ExtToken
name = match name $ \str =>
  let parts = split (== '.') $ normalizeNfc str in
  case find (`elem` allReservedStrings) (toList parts) of
    Nothing => T $ Name $ fromListP parts
    Just w  => Invalid "reserved word '\{w}' inside name \{str}" str

export
tokens : Tokenizer ExtToken
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+> -- ★ᵢ takes precedence over bare ★
  map resTokenizer reserved <+>
  [sup, nat, string, tag, name]

export
check : Alternative f => WithBounds ExtToken ->
        Either Error (f (WithBounds Token))
check (MkBounded val irr bounds@(MkBounds line col _ _)) = case val of
  Skip            => Right empty
  T tok           => Right $ pure $ MkBounded tok irr bounds
  Invalid msg tok => Left $ Err (Other msg) line col (index 0 tok)

export
toErrorReason : StopReason -> Maybe ErrorReason
toErrorReason EndInput                = Nothing
toErrorReason NoRuleApply             = Just NoRuleApply
toErrorReason (ComposeNotClosing s e) = Just $ ComposeNotClosing s e

export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case toErrorReason reason of
    Nothing => concatMap check res @{MonoidApplicative}
    Just e  => Left $ Err {reason = e, line, col, char = index 0 str}
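-- Example usage (illustrative sketch; `WithBounds` wrappers and the `Right`
-- constructor are omitted, and names are shown by their string parts):
--   lex "λ x ⇒ x"  ~>  [Reserved "λ", Name "x", Reserved "⇒", Name "x"]
--   lex "★3"       ~>  [TYPE 3]
--   lex "Type"     ~>  [Reserved "★"]  -- ascii aliases map to the unicode form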