This commit is contained in:
rhiannon morris 2023-02-28 20:51:54 +01:00
parent cacb3225a2
commit dbe248be9a
6 changed files with 357 additions and 13 deletions

View file

@ -1,4 +1,4 @@
module Quox.Unicode
module Quox.CharExtra
import Generics.Derive
@ -123,11 +123,13 @@ namespace Char
export
isIdStart : Char -> Bool
isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
isIdStart ch =
ch == '_' || isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
export
isIdCont : Char -> Bool
isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch
isIdCont ch =
isIdStart ch || ch == '\'' || isMark ch || isNumber ch
export
isIdConnector : Char -> Bool
@ -136,12 +138,11 @@ isIdConnector ch = genCat ch == Punctuation Connector
export
isSymChar : Char -> Bool
isSymChar ch =
case genCat ch of
Symbol _ => True
Punctuation Dash => True
Punctuation Other => True
_ => False
isSymChar ch = case genCat ch of
Symbol _ => True
Punctuation Dash => True
Punctuation Other => True
_ => False
export
isWhitespace : Char -> Bool

183
lib/Quox/Lexer.idr Normal file
View file

@ -0,0 +1,183 @@
module Quox.Lexer
import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import Text.Lexer
import Text.Lexer.Tokenizer
import Generics.Derive
%hide TT.Name
%default total
%language ElabReflection
||| @ R reserved token
||| @ I identifier
||| @ N nat literal
||| @ S string literal
||| @ T tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
%runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show]
-- token or whitespace
private
0 TokenW : Type
TokenW = Maybe Token
public export
record Error where
constructor Err
reason : StopReason
line, col : Int
||| `Nothing` if the error is at the end of the input
char : Maybe Char
%runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show]
%runElab derive "Error" [Generic, Meta, Eq, Ord, DecEq, Show]
skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing
export
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']
export
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont c = c == '\'' || isSymStart c
export
idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
idStart = pred isIdStart
idCont = pred isIdCont
idEnd = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
symStart = pred isSymStart
symCont = pred isSymCont
private %inline
resVal : String -> a -> Maybe Token
resVal str = const $ Just $ R str
export
resWordL, resSymL, syntaxL : String -> Lexer
resWordL str = exact str <+> reject idContEnd
resSymL str = exact str <+> reject symCont
syntaxL str = exact str
export
resWord, resSym, syntax : String -> Tokenizer TokenW
resWord str = match (resWordL str) (resVal str)
resSym str = match (resSymL str) (resVal str)
syntax str = match (syntaxL str) (resVal str)
-- return value contains unicode version
export
uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
uresSym u a = match (resSymL u <|> resSymL a) (resVal u)
usyntax u a = match (exact u <|> exact a) (resVal u)
export
alphaName, symName, baseNameL : Lexer
alphaName = idStart <+> many idCont <+> many idEnd
symName = symStart <+> many symCont
baseNameL = symName <|> alphaName
export
baseName : Tokenizer BaseName
baseName = match baseNameL UN
private
toName : String -> Name
toName = fromList . split (== '.')
export
name : Tokenizer TokenW
name = match (baseNameL <+> many (is '.' <+> baseNameL))
(Just . I . toName . normalizeNfc)
-- [todo] escapes other than \" and (accidentally) \\
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
go : List Char -> List Char
go [] = []
go ['\\'] = ['\\'] -- i guess???
go ('\\' :: c :: cs) = c :: go cs
go (c :: cs) = c :: go cs
export
string : Tokenizer TokenW
string = match stringLit (Just . S . fromStringLit)
export
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Just . N . cast)
export
tag : Tokenizer TokenW
tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
<|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
private %inline
fromSub : Char -> Char
fromSub c = case c of
'' => '0'; '' => '1'; '' => '2'; '' => '3'; '' => '4'
'' => '5'; '' => '6'; '' => '7'; '' => '8'; '' => '9'; _ => c
private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack
export
universe : Tokenizer TokenW
universe = universeWith "" <|> universeWith "Type" where
universeWith : String -> Tokenizer TokenW
universeWith pfx =
let len = length pfx in
match (exact pfx <+> some (range '0' '9'))
(Just . TYPE . cast . drop len) <|>
match (exact pfx <+> some (range '' ''))
(Just . TYPE . subToNat . drop len)
export
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
export
tokens : Tokenizer TokenW
tokens = choice $
map skip [pred isWhitespace,
lineComment (exact "--" <+> reject symCont),
blockComment (exact "{-") (exact "-}")] <+>
map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
[match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
map resSym ["@", ":"] <+>
map (uncurry uresSym)
[("·", "%"), ("","->"), ("×", "**"), ("", "=="), ("", "::")] <+>
map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
[resWord "ω", match (resSymL "#") (resVal "ω"),
universe, resSym "", match (resWordL "Type") (resVal "")] <+>
[nat, string, tag, name]
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
let (res, reason, line, col, str) = lex tokens str in
case reason of
EndInput => Right $ mapMaybe sequence res
_ => Left $ Err {reason, line, col, char = index 0 str}

View file

@ -34,7 +34,7 @@ record Name where
constructor MakeName
mods : SnocList String
base : BaseName
%runElab derive "Name" [Generic, Meta, Eq, Ord]
%runElab derive "Name" [Generic, Meta, Eq, Ord, DecEq]
export
Show Name where
@ -53,3 +53,10 @@ unq = MakeName [<]
export
toDots : Name -> String
toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base
export
fromList : List1 String -> Name
fromList (x ::: xs) = go [<] x xs where
go : SnocList String -> String -> List String -> Name
go mods x [] = MakeName mods (UN x)
go mods x (y :: ys) = go (mods :< x) y ys

View file

@ -10,8 +10,8 @@ depends = base, contrib, elab-util, sop, snocvect
modules =
Quox.NatExtra,
Quox.BoolExtra,
Quox.CharExtra,
Quox.Decidable,
Quox.Unicode,
Quox.No,
Quox.Pretty,
Quox.Syntax,
@ -34,4 +34,5 @@ modules =
Quox.Equal,
Quox.Name,
Quox.Typing,
Quox.Typechecker
Quox.Typechecker,
Quox.Lexer