quox/lib/Quox/Lexer.idr

256 lines
6.7 KiB
Idris
Raw Normal View History

2023-02-28 14:51:54 -05:00
module Quox.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
2023-03-06 06:04:43 -05:00
import public Data.String -- for singleton to reduce in IsReserved
2023-03-04 15:02:51 -05:00
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
2023-03-02 13:52:32 -05:00
import Derive.Prelude
2023-02-28 14:51:54 -05:00
%hide TT.Name
%default total
%language ElabReflection
||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ String string literal
||| @ Tag tag literal
2023-02-28 14:51:54 -05:00
||| @ TYPE "Type" or "★" with subscript
public export
data Token =
Reserved String
| Name Name
| Nat Nat
| Str String
| Tag String
| TYPE Nat
2023-03-02 13:52:32 -05:00
%runElab derive "Token" [Eq, Ord, Show]
2023-02-28 14:51:54 -05:00
-- token or whitespace
2023-03-04 15:02:51 -05:00
public export
2023-02-28 14:51:54 -05:00
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
2023-03-02 13:52:32 -05:00
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error" [Eq, Ord, Show]
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
2023-03-04 15:02:51 -05:00
private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match t f = Tokenizer.match t (Just . f)
%hide Tokenizer.match
export %inline
2023-02-28 14:51:54 -05:00
syntaxChars : List Char
2023-03-04 15:02:51 -05:00
syntaxChars = ['(', ')', '[', ']', '{', '}', '"', '\'', ',', '.', ';']
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private
stra, isSymCont : Char -> Bool
2023-02-28 14:51:54 -05:00
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c
2023-03-04 15:02:51 -05:00
private
idStart, idCont, idEnd, idContEnd : Lexer
2023-02-28 14:51:54 -05:00
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
2023-03-04 15:02:51 -05:00
private
symStart, symCont : Lexer
2023-02-28 14:51:54 -05:00
symStart = pred isSymStart
symCont = pred isSymCont
2023-03-04 15:02:51 -05:00
private
baseNameL : Lexer
baseNameL = idStart <+> many idCont <+> many idEnd
<|> symStart <+> many symCont
2023-02-28 14:51:54 -05:00
private
2023-03-04 15:02:51 -05:00
nameL : Lexer
nameL = baseNameL <+> many (is '.' <+> baseNameL)
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
name : Tokenizer TokenW
name = match nameL $ Name . fromList . split (== '.') . normalizeNfc
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
||| [todo] escapes other than `\"` and (accidentally) `\\`
2023-02-28 14:51:54 -05:00
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
string : Tokenizer TokenW
string = match stringLit (Str . fromStringLit)
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Nat . cast)
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
tag : Tokenizer TokenW
tag = match (is '\'' <+> nameL) (Tag . drop 1)
<|> match (is '\'' <+> stringLit) (Tag . fromStringLit . drop 1)
2023-02-28 14:51:54 -05:00
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
2023-03-04 15:02:51 -05:00
private
2023-02-28 14:51:54 -05:00
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> some (range '0' '9'))
2023-03-04 15:02:51 -05:00
(TYPE . cast . drop len) <|>
2023-02-28 14:51:54 -05:00
match (exact pfx <+> some (range '' ''))
2023-03-04 15:02:51 -05:00
(TYPE . subToNat . drop len)
2023-02-28 14:51:54 -05:00
2023-03-04 15:02:51 -05:00
private %inline
2023-02-28 14:51:54 -05:00
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
namespace Reserved
2023-03-04 15:02:51 -05:00
||| description of a reserved symbol
||| @ Word a reserved word (must not be followed by letters, digits, etc)
||| @ Sym a reserved symbol (must not be followed by symbolic chars)
||| @ Punc a character that doesn't show up in names (brackets, etc)
2023-03-04 15:02:51 -05:00
public export
data Reserved1 = Word String | Sym String | Punc Char
%runElab derive "Reserved1" [Eq, Ord, Show]
2023-03-04 15:02:51 -05:00
||| description of a token that might have unicode & ascii-only aliases
public export
data Reserved = Only Reserved1 | Or Reserved1 Reserved1
%runElab derive "Reserved" [Eq, Ord, Show]
2023-03-04 15:02:51 -05:00
public export
Sym1, Word1 : String -> Reserved
Sym1 = Only . Sym
Word1 = Only . Word
2023-03-04 15:02:51 -05:00
public export
Punc1 : Char -> Reserved
Punc1 = Only . Punc
2023-03-04 15:02:51 -05:00
public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym s) = s
2023-03-04 15:02:51 -05:00
||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one
public export
resString : Reserved -> String
2023-03-04 15:02:51 -05:00
resString (Only r) = resString1 r
resString (r `Or` _) = resString1 r
private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
2023-03-04 15:02:51 -05:00
resTokenizer1 r str =
let res : String -> Token := const $ Reserved str in
case r of Word w => match (exact w <+> reject idContEnd) res
Sym s => match (exact s <+> reject symCont) res
Punc x => match (is x) res
2023-03-04 15:02:51 -05:00
||| match a reserved token
export
resTokenizer : Reserved -> Tokenizer TokenW
2023-03-04 15:02:51 -05:00
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (r `Or` s) =
resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
2023-03-04 15:02:51 -05:00
public export
reserved : List Reserved
2023-03-04 15:02:51 -05:00
reserved =
[Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
Punc1 ',', Punc1 ';',
Sym1 "@",
Sym1 ":",
Sym "" `Or` Sym "=>",
Sym "" `Or` Sym "->",
Sym "×" `Or` Sym "**",
Sym "" `Or` Sym "==",
Sym "" `Or` Sym "::",
Sym "·" `Or` Punc '.',
Word1 "case",
Word1 "case1",
Word "caseω" `Or` Word "case#",
Word1 "return",
Word1 "of",
Word1 "_",
Word1 "Eq",
Word "λ" `Or` Word "fun",
Word "δ" `Or` Word "dfun",
Word "ω" `Or` Sym "#",
Sym "" `Or` Word "Type",
Word1 "def",
Word1 "def0",
Word "defω" `Or` Word "def#",
Sym "" `Or` Sym ":="]
||| `IsReserved str` is true if `Reserved str` might actually show up in
2023-03-04 15:02:51 -05:00
||| the token stream
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved
2023-02-28 14:51:54 -05:00
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
2023-03-04 15:02:51 -05:00
[universe] <+> -- ★ᵢ takes precedence over bare ★
map resTokenizer reserved <+>
2023-02-28 14:51:54 -05:00
[nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}