quox/lib/Quox/Lexer.idr

255 lines
6.7 KiB
Idris
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module Quox.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name
%default total
%language ElabReflection
||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ String string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token =
Reserved String
| Name Name
| Nat Nat
| Str String
| Tag String
| TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]
-- token or whitespace
public export
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
private
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match
export %inline
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';']
private
stra, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c
private
idStart, idCont, idEnd, idContEnd : Lexer
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
private
symStart, symCont : Lexer
symStart = pred isSymStart
symCont = pred isSymCont
private
baseNameL : Lexer
baseNameL = idStart <+> many idCont <+> many idEnd
<|> symStart <+> many symCont
private
nameL : Lexer
nameL = baseNameL <+> many (is '.' <+> baseNameL)
private
name : Tokenizer TokenW
name = match nameL $ Name . fromList . split (== '.') . normalizeNfc
||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
private
string : Tokenizer TokenW
string = match stringLit (Str . fromStringLit)
private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Nat . cast)
private
tag : Tokenizer TokenW
tag = match (is '\'' <+> nameL) (Tag . drop 1)
<|> match (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
private
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> some (range '0' '9'))
(TYPE . cast . drop len) <|>
match (exact pfx <+> some (range '' ''))
(TYPE . subToNat . drop len)
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
namespace Reserved
||| description of a reserved symbol
||| @ Word a reserved word (must not be followed by letters, digits, etc)
||| @ Sym a reserved symbol (must not be followed by symbolic chars)
||| @ Punc a character that doesn't show up in names (brackets, etc)
public export
data Reserved1 = Word String | Sym String | Punc Char
%runElab derive "Reserved1" [Eq, Ord, Show]
||| description of a token that might have unicode & ascii-only aliases
public export
data Reserved = Only Reserved1 | Or Reserved1 Reserved1
%runElab derive "Reserved" [Eq, Ord, Show]
public export
Sym1, Word1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word
public export
Punc1 : Char -> Reserved
Punc1 = Only . Punc
public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym s) = s
||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r
private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
resTokenizer1 r str =
let res : String -> Token := const $ Reserved str in
case r of Word w => match (exact w <+> reject idContEnd) res
Sym s => match (exact s <+> reject symCont) res
Punc x => match (is x) res
||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer TokenW
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
[Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
Punc1 ',', Punc1 ';',
Sym1 "@",
Sym1 ":",
Sym "" `Or` Sym "=>",
Sym "" `Or` Sym "->",
Sym "×" `Or` Sym "**",
Sym "" `Or` Sym "==",
Sym "" `Or` Sym "::",
Sym "·" `Or` Punc '.',
Word1 "case",
Word1 "case1",
Word "caseω" `Or` Word "case#",
Word1 "return",
Word1 "of",
Word1 "_",
Word1 "Eq",
Word "λ" `Or` Word "fun",
Word "δ" `Or` Word "dfun",
Word "ω" `Or` Sym "#",
Sym "" `Or` Word "Type",
Word1 "def",
Word1 "def0",
Word "defω" `Or` Word "def#",
Sym "" `Or` Sym ":="]
||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
[universe] <+> -- ★ᵢ takes precedence over bare ★
map resTokenizer reserved <+>
[nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}