quox/lib/Quox/Parser/Lexer.idr
2023-09-18 21:53:38 +02:00

245 lines
6.6 KiB
Idris
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module Quox.Parser.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name
%default total
%language ElabReflection
||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ String string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" with ascii nat directly after
||| @ Sup superscript or ^ number (displacement, or universe for ★)
public export
data Token =
Reserved String
| Name PName
| Nat Nat
| Str String
| Tag String
| TYPE Nat
| Sup Nat
%runElab derive "Token" [Eq, Ord, Show]
-- token or whitespace
public export
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
private
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match
private
name : Tokenizer TokenW
name = match name $ Name . fromListP . split (== '.') . normalizeNfc
||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
private
string : Tokenizer TokenW
string = match stringLit (Str . fromStringLit)
private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Nat . cast)
private
tag : Tokenizer TokenW
tag = match (is '\'' <+> name) (Tag . drop 1)
<|> match (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
fromSup : Char -> Char
fromSup c = case c of
'' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
private %inline
supToNat : String -> Nat
supToNat = cast . pack . map fromSup . unpack
-- ★0, Type0. base ★/Type is a Reserved
private
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> digits) (TYPE . cast . drop len)
private
sup : Tokenizer TokenW
sup = match (some $ pred isSupDigit) (Sup . supToNat)
<|> match (is '^' <+> digits) (Sup . cast . drop 1)
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
namespace Reserved
||| description of a reserved symbol
||| @ Word a reserved word (must not be followed by letters, digits, etc)
||| @ Sym a reserved symbol (must not be followed by symbolic chars)
||| @ Punc a character that doesn't show up in names (brackets, etc)
public export
data Reserved1 = Word String | Sym String | Punc Char
%runElab derive "Reserved1" [Eq, Ord, Show]
||| description of a token that might have unicode & ascii-only aliases
public export
data Reserved = Only Reserved1 | Or Reserved1 Reserved1
%runElab derive "Reserved" [Eq, Ord, Show]
public export
Sym1, Word1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word
public export
Punc1 : Char -> Reserved
Punc1 = Only . Punc
public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym s) = s
||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r
private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
resTokenizer1 r str =
let res : String -> Token := const $ Reserved str in
case r of Word w => match (exact w <+> reject idContEnd) res
Sym s => match (exact s <+> reject symCont) res
Punc x => match (is x) res
||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer TokenW
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
[Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
Punc1 ',', Punc1 ';',
Sym1 "@",
Sym1 ":",
Sym "" `Or` Sym "=>",
Sym "" `Or` Sym "->",
Sym "×" `Or` Sym "**",
Sym "" `Or` Sym "==",
Sym "" `Or` Sym "::",
Punc1 '.',
Word1 "case",
Word1 "case0", Word1 "case1",
Word "caseω" `Or` Word "case#",
Word1 "return",
Word1 "of",
Word1 "fst", Word1 "snd",
Word1 "_",
Word1 "Eq",
Word "λ" `Or` Word "fun",
Word "δ" `Or` Word "dfun",
Word "ω" `Or` Sym "#",
Sym "" `Or` Word "Type",
Word "" `Or` Word "Nat",
Word1 "zero", Word1 "succ",
Word1 "coe", Word1 "comp",
Word1 "def",
Word1 "def0",
Word "defω" `Or` Word "def#",
Sym1 "=",
Word1 "load",
Word1 "namespace"]
||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
[universe] <+> -- ★ᵢ takes precedence over bare ★
map resTokenizer reserved <+>
[sup, nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}