module Quox.Parser.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude

%hide TT.Name

%default total
%language ElabReflection


||| @ Reserved reserved token
||| @ Name     name, possibly qualified
||| @ Nat      nat literal
||| @ String   string literal
||| @ Tag      tag literal
||| @ TYPE     "Type" or "★" with ascii nat directly after
||| @ Sup      superscript or ^ number (displacement, or universe for ★)
public export
data Token
  = Reserved String
  | Name PName
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
  | Sup Nat
%runElab derive "Token" [Eq, Ord, Show]

-- token or whitespace
public export
0 TokenW : Type
TokenW = Maybe Token


public export
record Error where
  constructor Err
  reason : StopReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]


private
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing

private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match


private
name : Tokenizer TokenW
name = match name $ Name . fromListP . split (== '.') . normalizeNfc

||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go [] = []
  go ['\\'] = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs) = c :: go cs

private
string : Tokenizer TokenW
string = match stringLit (Str . fromStringLit)

private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Nat . cast)

private
tag : Tokenizer TokenW
tag = match (is '\'' <+> name) (Tag . drop 1)
  <|> match (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)


private %inline
fromSub : Char -> Char
fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c

private %inline
fromSup : Char -> Char
fromSup c = case c of
  '⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
  '⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c

private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack

private %inline
supToNat : String -> Nat
supToNat = cast . pack . map fromSup . unpack


-- ★0, Type0. base ★/Type is a Reserved
private
universe : Tokenizer TokenW
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    let len = length pfx in
    match (exact pfx <+> digits) (TYPE . cast . drop len)

private
sup : Tokenizer TokenW
sup = match (some $ pred isSupDigit) (Sup . supToNat)
  <|> match (is '^' <+> digits) (Sup . cast . drop 1)


private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts


namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym  a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc)
  public export
  data Reserved1 = Word String | Sym String | Punc Char
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]

public export
Sym1, Word1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word

public export
Punc1 : Char -> Reserved
Punc1 = Only . Punc

public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym s) = s

||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r

private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
resTokenizer1 r str =
  let res : String -> Token := const $ Reserved str in
  case r of
    Word w => match (exact w <+> reject idContEnd) res
    Sym s  => match (exact s <+> reject symCont) res
    Punc x => match (is x) res

||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer TokenW
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)

||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
  [Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
   Punc1 ',', Punc1 ';',
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Sym "⊲" `Or` Sym "<|",
   Sym "⋄" `Or` Sym "<>",
   Punc1 '.',
   Word1 "case",
   Word1 "case0", Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   Word "ω" `Or` Sym "#",
   Sym "★" `Or` Word "Type",
   Word "ℕ" `Or` Word "Nat",
   Word1 "zero", Word1 "succ",
   Word1 "coe", Word1 "comp",
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Sym1 "=",
   Word1 "load",
   Word1 "namespace"]

||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved


export
tokens : Tokenizer TokenW
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+> -- ★ᵢ takes precedence over bare ★
  map resTokenizer reserved <+>
  [sup, nat, string, tag, name]

export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case reason of
    EndInput => Right $ mapMaybe sequence res
    _        => Left $ Err {reason, line, col, char = index 0 str}
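
-- a rough sketch of the token streams `lex` should produce for a few small
-- inputs (bounds omitted, and `Name` payloads shown as plain strings rather
-- than as the `PName` values actually built by `fromListP`):
--
--   lex "Type0"       ~>  Right [TYPE 0]
--   lex "★3"          ~>  Right [TYPE 3]
--   lex "★³"          ~>  Right [Reserved "★", Sup 3]
--   lex "'tag"        ~>  Right [Tag "tag"]
--   lex "fun x => x"  ~>  Right [Reserved "λ", Name "x", Reserved "⇒", Name "x"]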