quox/lib/Quox/Lexer.idr
2023-03-04 21:35:09 +01:00

242 lines
6.2 KiB
Idris
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module Quox.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name
%default total
%language ElabReflection
||| @ R reserved token
||| @ I identifier
||| @ N nat literal
||| @ S string literal
||| @ T tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]
-- token or whitespace
public export
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
private
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match
export %inline
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';']
private
stra, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c
private
idStart, idCont, idEnd, idContEnd : Lexer
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
private
symStart, symCont : Lexer
symStart = pred isSymStart
symCont = pred isSymCont
private
baseNameL : Lexer
baseNameL = idStart <+> many idCont <+> many idEnd
<|> symStart <+> many symCont
private
nameL : Lexer
nameL = baseNameL <+> many (is '.' <+> baseNameL)
private
name : Tokenizer TokenW
name = match nameL $ I . fromList . split (== '.') . normalizeNfc
||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
private
string : Tokenizer TokenW
string = match stringLit (S . fromStringLit)
private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (N . cast)
private
tag : Tokenizer TokenW
tag = match (is '\'' <+> nameL) (T . drop 1)
<|> match (is '\'' <+> stringLit) (T . fromStringLit . drop 1)
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
private
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> some (range '0' '9'))
(TYPE . cast . drop len) <|>
match (exact pfx <+> some (range '' ''))
(TYPE . subToNat . drop len)
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
namespace Res
||| description of a reserved symbol
||| @ W a reserved word (must not be followed by letters, digits, etc)
||| @ S a reserved symbol (must not be followed by symbolic chars)
||| @ X a character that doesn't show up in names (brackets, etc)
public export
data Res1 = W String | S String | X Char
%runElab derive "Res1" [Eq, Ord, Show]
||| description of a token that might have unicode & ascii-only aliases
public export
data Res = Only Res1 | Or Res1 Res1
%runElab derive "Res" [Eq, Ord, Show]
public export
S1, W1 : String -> Res
S1 = Only . S
W1 = Only . W
public export
X1 : Char -> Res
X1 = Only . X
public export
resString1 : Res1 -> String
resString1 (X x) = singleton x
resString1 (W w) = w
resString1 (S s) = s
||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Res -> String
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r
private
resTokenizer1 : Res1 -> String -> Tokenizer TokenW
resTokenizer1 r str =
let res : String -> Token := const $ R str in
case r of W w => match (exact w <+> reject idContEnd) res
S s => match (exact s <+> reject symCont) res
X x => match (is x) res
||| match a reserved token
export
resTokenizer : Res -> Tokenizer TokenW
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `R a`. e.g. `=>` in the input
||| (if not part of a longer name) will be returned as `R "⇒"`.
public export
reserved : List Res
reserved =
[X1 '(', X1 ')', X1 '[', X1 ']', X1 '{', X1 '}', X1 ',', X1 ';',
S1 "@",
S1 ":",
S "" `Or` S "=>",
S "" `Or` S "->",
S "×" `Or` S "**",
S "" `Or` S "==",
S "" `Or` S "::",
S "·" `Or` X '.',
W1 "case",
W1 "case1",
W "caseω" `Or` W "case#",
W1 "return",
W1 "of",
W1 "_",
W1 "Eq",
W "λ" `Or` W "fun",
W "δ" `Or` W "dfun",
W "ω" `Or` S "#",
S "" `Or` W "Type"]
||| `IsReserved str` is true if `R str` might actually show up in
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
[universe] <+> -- ★ᵢ takes precedence over bare ★
map resTokenizer reserved <+>
[nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}