lexer
This commit is contained in:
parent
cacb3225a2
commit
dbe248be9a
6 changed files with 357 additions and 13 deletions
|
@ -1,4 +1,4 @@
|
||||||
module Quox.Unicode
|
module Quox.CharExtra
|
||||||
|
|
||||||
import Generics.Derive
|
import Generics.Derive
|
||||||
|
|
||||||
|
@ -123,11 +123,13 @@ namespace Char
|
||||||
|
|
||||||
export
|
export
|
||||||
isIdStart : Char -> Bool
|
isIdStart : Char -> Bool
|
||||||
isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
|
isIdStart ch =
|
||||||
|
ch == '_' || isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
|
||||||
|
|
||||||
export
|
export
|
||||||
isIdCont : Char -> Bool
|
isIdCont : Char -> Bool
|
||||||
isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch
|
isIdCont ch =
|
||||||
|
isIdStart ch || ch == '\'' || isMark ch || isNumber ch
|
||||||
|
|
||||||
export
|
export
|
||||||
isIdConnector : Char -> Bool
|
isIdConnector : Char -> Bool
|
||||||
|
@ -136,8 +138,7 @@ isIdConnector ch = genCat ch == Punctuation Connector
|
||||||
|
|
||||||
export
|
export
|
||||||
isSymChar : Char -> Bool
|
isSymChar : Char -> Bool
|
||||||
isSymChar ch =
|
isSymChar ch = case genCat ch of
|
||||||
case genCat ch of
|
|
||||||
Symbol _ => True
|
Symbol _ => True
|
||||||
Punctuation Dash => True
|
Punctuation Dash => True
|
||||||
Punctuation Other => True
|
Punctuation Other => True
|
183
lib/Quox/Lexer.idr
Normal file
183
lib/Quox/Lexer.idr
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
module Quox.Lexer
|
||||||
|
|
||||||
|
import Quox.CharExtra
|
||||||
|
import Quox.Name
|
||||||
|
import Data.String.Extra
|
||||||
|
import Data.SortedMap
|
||||||
|
import Text.Lexer
|
||||||
|
import Text.Lexer.Tokenizer
|
||||||
|
import Generics.Derive
|
||||||
|
%hide TT.Name
|
||||||
|
|
||||||
|
|
||||||
|
%default total
|
||||||
|
%language ElabReflection
|
||||||
|
|
||||||
|
|
||||||
|
||| @ R reserved token
|
||||||
|
||| @ I identifier
|
||||||
|
||| @ N nat literal
|
||||||
|
||| @ S string literal
|
||||||
|
||| @ T tag literal
|
||||||
|
||| @ TYPE "Type" or "★" with subscript
|
||||||
|
public export
|
||||||
|
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
|
||||||
|
%runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show]
|
||||||
|
|
||||||
|
-- token or whitespace
|
||||||
|
private
|
||||||
|
0 TokenW : Type
|
||||||
|
TokenW = Maybe Token
|
||||||
|
|
||||||
|
|
||||||
|
public export
|
||||||
|
record Error where
|
||||||
|
constructor Err
|
||||||
|
reason : StopReason
|
||||||
|
line, col : Int
|
||||||
|
||| `Nothing` if the error is at the end of the input
|
||||||
|
char : Maybe Char
|
||||||
|
%runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show]
|
||||||
|
%runElab derive "Error" [Generic, Meta, Eq, Ord, DecEq, Show]
|
||||||
|
|
||||||
|
|
||||||
|
skip : Lexer -> Tokenizer TokenW
|
||||||
|
skip t = match t $ const Nothing
|
||||||
|
|
||||||
|
export
|
||||||
|
syntaxChars : List Char
|
||||||
|
syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']
|
||||||
|
|
||||||
|
export
|
||||||
|
isSymStart, isSymCont : Char -> Bool
|
||||||
|
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
|
||||||
|
isSymCont c = c == '\'' || isSymStart c
|
||||||
|
|
||||||
|
export
|
||||||
|
idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
|
||||||
|
idStart = pred isIdStart
|
||||||
|
idCont = pred isIdCont
|
||||||
|
idEnd = pred $ \c => c `elem` unpack "?!#"
|
||||||
|
idContEnd = idCont <|> idEnd
|
||||||
|
symStart = pred isSymStart
|
||||||
|
symCont = pred isSymCont
|
||||||
|
|
||||||
|
private %inline
|
||||||
|
resVal : String -> a -> Maybe Token
|
||||||
|
resVal str = const $ Just $ R str
|
||||||
|
|
||||||
|
export
|
||||||
|
resWordL, resSymL, syntaxL : String -> Lexer
|
||||||
|
resWordL str = exact str <+> reject idContEnd
|
||||||
|
resSymL str = exact str <+> reject symCont
|
||||||
|
syntaxL str = exact str
|
||||||
|
|
||||||
|
export
|
||||||
|
resWord, resSym, syntax : String -> Tokenizer TokenW
|
||||||
|
resWord str = match (resWordL str) (resVal str)
|
||||||
|
resSym str = match (resSymL str) (resVal str)
|
||||||
|
syntax str = match (syntaxL str) (resVal str)
|
||||||
|
|
||||||
|
-- return value contains unicode version
|
||||||
|
export
|
||||||
|
uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
|
||||||
|
uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
|
||||||
|
uresSym u a = match (resSymL u <|> resSymL a) (resVal u)
|
||||||
|
usyntax u a = match (exact u <|> exact a) (resVal u)
|
||||||
|
|
||||||
|
export
|
||||||
|
alphaName, symName, baseNameL : Lexer
|
||||||
|
alphaName = idStart <+> many idCont <+> many idEnd
|
||||||
|
symName = symStart <+> many symCont
|
||||||
|
baseNameL = symName <|> alphaName
|
||||||
|
|
||||||
|
export
|
||||||
|
baseName : Tokenizer BaseName
|
||||||
|
baseName = match baseNameL UN
|
||||||
|
|
||||||
|
private
|
||||||
|
toName : String -> Name
|
||||||
|
toName = fromList . split (== '.')
|
||||||
|
|
||||||
|
export
|
||||||
|
name : Tokenizer TokenW
|
||||||
|
name = match (baseNameL <+> many (is '.' <+> baseNameL))
|
||||||
|
(Just . I . toName . normalizeNfc)
|
||||||
|
|
||||||
|
-- [todo] escapes other than \" and (accidentally) \\
|
||||||
|
export
|
||||||
|
fromStringLit : String -> String
|
||||||
|
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
|
||||||
|
go : List Char -> List Char
|
||||||
|
go [] = []
|
||||||
|
go ['\\'] = ['\\'] -- i guess???
|
||||||
|
go ('\\' :: c :: cs) = c :: go cs
|
||||||
|
go (c :: cs) = c :: go cs
|
||||||
|
|
||||||
|
export
|
||||||
|
string : Tokenizer TokenW
|
||||||
|
string = match stringLit (Just . S . fromStringLit)
|
||||||
|
|
||||||
|
export
|
||||||
|
nat : Tokenizer TokenW
|
||||||
|
nat = match (some (range '0' '9')) (Just . N . cast)
|
||||||
|
|
||||||
|
export
|
||||||
|
tag : Tokenizer TokenW
|
||||||
|
tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
|
||||||
|
<|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private %inline
|
||||||
|
fromSub : Char -> Char
|
||||||
|
fromSub c = case c of
|
||||||
|
'₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
|
||||||
|
'₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c
|
||||||
|
|
||||||
|
private %inline
|
||||||
|
subToNat : String -> Nat
|
||||||
|
subToNat = cast . pack . map fromSub . unpack
|
||||||
|
|
||||||
|
|
||||||
|
export
|
||||||
|
universe : Tokenizer TokenW
|
||||||
|
universe = universeWith "★" <|> universeWith "Type" where
|
||||||
|
universeWith : String -> Tokenizer TokenW
|
||||||
|
universeWith pfx =
|
||||||
|
let len = length pfx in
|
||||||
|
match (exact pfx <+> some (range '0' '9'))
|
||||||
|
(Just . TYPE . cast . drop len) <|>
|
||||||
|
match (exact pfx <+> some (range '₀' '₉'))
|
||||||
|
(Just . TYPE . subToNat . drop len)
|
||||||
|
|
||||||
|
|
||||||
|
export
|
||||||
|
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
|
||||||
|
choice (t :: ts) = foldl (\a, b => a <|> b) t ts
|
||||||
|
|
||||||
|
|
||||||
|
export
|
||||||
|
tokens : Tokenizer TokenW
|
||||||
|
tokens = choice $
|
||||||
|
map skip [pred isWhitespace,
|
||||||
|
lineComment (exact "--" <+> reject symCont),
|
||||||
|
blockComment (exact "{-") (exact "-}")] <+>
|
||||||
|
map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
|
||||||
|
[match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
|
||||||
|
map resSym ["@", ":"] <+>
|
||||||
|
map (uncurry uresSym)
|
||||||
|
[("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+>
|
||||||
|
map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
|
||||||
|
map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
|
||||||
|
[resWord "ω", match (resSymL "#") (resVal "ω"),
|
||||||
|
universe, resSym "★", match (resWordL "Type") (resVal "★")] <+>
|
||||||
|
[nat, string, tag, name]
|
||||||
|
|
||||||
|
export
|
||||||
|
lex : String -> Either Error (List (WithBounds Token))
|
||||||
|
lex str =
|
||||||
|
let (res, reason, line, col, str) = lex tokens str in
|
||||||
|
case reason of
|
||||||
|
EndInput => Right $ mapMaybe sequence res
|
||||||
|
_ => Left $ Err {reason, line, col, char = index 0 str}
|
|
@ -34,7 +34,7 @@ record Name where
|
||||||
constructor MakeName
|
constructor MakeName
|
||||||
mods : SnocList String
|
mods : SnocList String
|
||||||
base : BaseName
|
base : BaseName
|
||||||
%runElab derive "Name" [Generic, Meta, Eq, Ord]
|
%runElab derive "Name" [Generic, Meta, Eq, Ord, DecEq]
|
||||||
|
|
||||||
export
|
export
|
||||||
Show Name where
|
Show Name where
|
||||||
|
@ -53,3 +53,10 @@ unq = MakeName [<]
|
||||||
export
|
export
|
||||||
toDots : Name -> String
|
toDots : Name -> String
|
||||||
toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base
|
toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base
|
||||||
|
|
||||||
|
export
|
||||||
|
fromList : List1 String -> Name
|
||||||
|
fromList (x ::: xs) = go [<] x xs where
|
||||||
|
go : SnocList String -> String -> List String -> Name
|
||||||
|
go mods x [] = MakeName mods (UN x)
|
||||||
|
go mods x (y :: ys) = go (mods :< x) y ys
|
||||||
|
|
|
@ -10,8 +10,8 @@ depends = base, contrib, elab-util, sop, snocvect
|
||||||
modules =
|
modules =
|
||||||
Quox.NatExtra,
|
Quox.NatExtra,
|
||||||
Quox.BoolExtra,
|
Quox.BoolExtra,
|
||||||
|
Quox.CharExtra,
|
||||||
Quox.Decidable,
|
Quox.Decidable,
|
||||||
Quox.Unicode,
|
|
||||||
Quox.No,
|
Quox.No,
|
||||||
Quox.Pretty,
|
Quox.Pretty,
|
||||||
Quox.Syntax,
|
Quox.Syntax,
|
||||||
|
@ -34,4 +34,5 @@ modules =
|
||||||
Quox.Equal,
|
Quox.Equal,
|
||||||
Quox.Name,
|
Quox.Name,
|
||||||
Quox.Typing,
|
Quox.Typing,
|
||||||
Quox.Typechecker
|
Quox.Typechecker,
|
||||||
|
Quox.Lexer
|
||||||
|
|
|
@ -4,6 +4,7 @@ import TAP
|
||||||
import Tests.Reduce
|
import Tests.Reduce
|
||||||
import Tests.Equal
|
import Tests.Equal
|
||||||
import Tests.Typechecker
|
import Tests.Typechecker
|
||||||
|
import Tests.Lexer
|
||||||
import System
|
import System
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,7 +12,8 @@ allTests : List Test
|
||||||
allTests = [
|
allTests = [
|
||||||
Reduce.tests,
|
Reduce.tests,
|
||||||
Equal.tests,
|
Equal.tests,
|
||||||
Typechecker.tests
|
Typechecker.tests,
|
||||||
|
Lexer.tests
|
||||||
]
|
]
|
||||||
|
|
||||||
main : IO ()
|
main : IO ()
|
||||||
|
|
150
tests/Tests/Lexer.idr
Normal file
150
tests/Tests/Lexer.idr
Normal file
|
@ -0,0 +1,150 @@
|
||||||
|
module Tests.Lexer
|
||||||
|
|
||||||
|
import Quox.Name
|
||||||
|
import Quox.Lexer
|
||||||
|
import Text.Lexer.Tokenizer
|
||||||
|
import TAP
|
||||||
|
|
||||||
|
|
||||||
|
private
|
||||||
|
data Failure = LexError Lexer.Error
|
||||||
|
| WrongLex (List Token) (List Token)
|
||||||
|
| ExpectedFail (List Token)
|
||||||
|
|
||||||
|
private
|
||||||
|
ToInfo Failure where
|
||||||
|
toInfo (LexError err) =
|
||||||
|
[("type", "LexError"),
|
||||||
|
("info", show err)]
|
||||||
|
toInfo (WrongLex want got) =
|
||||||
|
[("type", "WrongLex"),
|
||||||
|
("want", show want),
|
||||||
|
("got", show got)]
|
||||||
|
toInfo (ExpectedFail got) =
|
||||||
|
[("type", "ExpectedFail"),
|
||||||
|
("got", show got)]
|
||||||
|
|
||||||
|
|
||||||
|
private
|
||||||
|
denewline : String -> String
|
||||||
|
denewline = pack . map (\case '\n' => ''; c => c) . unpack
|
||||||
|
|
||||||
|
private
|
||||||
|
lexes : String -> List Token -> Test
|
||||||
|
lexes str toks = test "「\{denewline str}」" {e = Failure} $
|
||||||
|
case lex str of
|
||||||
|
Left err => throwError $ LexError err
|
||||||
|
Right res =>
|
||||||
|
let res = map val res in
|
||||||
|
unless (toks == res) $ throwError $ WrongLex toks res
|
||||||
|
|
||||||
|
private
|
||||||
|
lexFail : String -> Test
|
||||||
|
lexFail str = test "「\{denewline str}」 # fails" {e = Failure} $
|
||||||
|
case lex str of
|
||||||
|
Left err => pure ()
|
||||||
|
Right res => throwError $ ExpectedFail $ map val res
|
||||||
|
|
||||||
|
export
|
||||||
|
tests : Test
|
||||||
|
tests = "lexer" :- [
|
||||||
|
"comments" :- [
|
||||||
|
lexes "" [],
|
||||||
|
lexes " " [],
|
||||||
|
lexes "-- line comment" [],
|
||||||
|
lexes "name -- line comment" [I "name"],
|
||||||
|
lexes "-- line comment\nnameBetween -- and another" [I "nameBetween"],
|
||||||
|
lexes "{- block comment -}" [],
|
||||||
|
lexes "{- {- nested -} block comment -}" []
|
||||||
|
],
|
||||||
|
|
||||||
|
"identifiers & keywords" :- [
|
||||||
|
lexes "abc" [I "abc"],
|
||||||
|
lexes "abc def" [I "abc", I "def"],
|
||||||
|
lexes "abc{-comment-}def" [I "abc", I "def"],
|
||||||
|
lexes "λ" [R "λ"],
|
||||||
|
lexes "fun" [R "λ"],
|
||||||
|
lexes "δ" [R "δ"],
|
||||||
|
lexes "dfun" [R "δ"],
|
||||||
|
lexes "funky" [I "funky"],
|
||||||
|
lexes "δελτα" [I "δελτα"],
|
||||||
|
lexes "a.b.c.d.e" [I $ MakeName [< "a","b","c","d"] "e"],
|
||||||
|
lexes "normalïse" [I "normalïse"],
|
||||||
|
-- ↑ replace i + combining ¨ with precomposed ï
|
||||||
|
|
||||||
|
todo "check for reserved words in a qname",
|
||||||
|
-- lexes "abc.fun.def" [I "abc", R ".", R "λ", R ".", I "def"],
|
||||||
|
|
||||||
|
lexes "+" [I "+"],
|
||||||
|
lexes "*" [I "*"],
|
||||||
|
lexes "**" [R "×"],
|
||||||
|
lexes "***" [I "***"],
|
||||||
|
lexes "+**" [I "+**"],
|
||||||
|
lexes "+.+.+" [I $ MakeName [< "+", "+"] "+"],
|
||||||
|
lexes "a.+" [I $ MakeName [< "a"] "+"],
|
||||||
|
lexes "+.a" [I $ MakeName [< "+"] "a"],
|
||||||
|
|
||||||
|
lexes "+a" [I "+", I "a"],
|
||||||
|
|
||||||
|
lexes "x." [I "x", R "."],
|
||||||
|
lexes "&." [I "&", R "."],
|
||||||
|
lexes ".x" [R ".", I "x"],
|
||||||
|
lexes "a.b.c." [I $ MakeName [< "a", "b"] "c", R "."],
|
||||||
|
|
||||||
|
lexes "case" [R "case"],
|
||||||
|
lexes "caseω" [R "caseω"],
|
||||||
|
lexes "case#" [R "caseω"],
|
||||||
|
lexes "case1" [R "case1"],
|
||||||
|
lexes "case0" [I "case0"],
|
||||||
|
lexes "case##" [I "case##"],
|
||||||
|
|
||||||
|
lexes "_" [R "_"],
|
||||||
|
lexes "_a" [I "_a"],
|
||||||
|
lexes "a_" [I "a_"],
|
||||||
|
|
||||||
|
lexes "a'" [I "a'"],
|
||||||
|
lexes "+'" [I "+'"],
|
||||||
|
lexFail "'+"
|
||||||
|
],
|
||||||
|
|
||||||
|
"syntax characters" :- [
|
||||||
|
lexes "()" [R "(", R ")"],
|
||||||
|
lexes "(a)" [R "(", I "a", R ")"],
|
||||||
|
lexes "(^)" [R "(", I "^", R ")"],
|
||||||
|
lexes "`{a,b}" [R "`{", I "a", R ",", I "b", R "}"],
|
||||||
|
lexes "`{+,-}" [R "`{", I "+", R ",", I "-", R "}"],
|
||||||
|
lexFail "` {}",
|
||||||
|
-- [todo] should this be lexed as "`{", "-", "mid", etc?
|
||||||
|
lexFail "`{-mid token comment-}{}"
|
||||||
|
],
|
||||||
|
|
||||||
|
"tags" :- [
|
||||||
|
lexes "`a" [T "a"],
|
||||||
|
lexes "`abc" [T "abc"],
|
||||||
|
lexes "`+" [T "+"],
|
||||||
|
lexes "`$$$" [T "$$$"],
|
||||||
|
lexes #"`"multi word tag""# [T "multi word tag"],
|
||||||
|
lexFail "`",
|
||||||
|
lexFail "``"
|
||||||
|
],
|
||||||
|
|
||||||
|
"strings" :- [
|
||||||
|
lexes #" "" "# [S ""],
|
||||||
|
lexes #" "abc" "# [S "abc"],
|
||||||
|
lexes #" "\"" "# [S "\""],
|
||||||
|
lexes #" "\\" "# [S "\\"],
|
||||||
|
lexes #" "\\\"" "# [S "\\\""],
|
||||||
|
todo "other escapes"
|
||||||
|
],
|
||||||
|
|
||||||
|
"universes" :- [
|
||||||
|
lexes "Type0" [TYPE 0],
|
||||||
|
lexes "Type₀" [TYPE 0],
|
||||||
|
lexes "Type9999999" [TYPE 9999999],
|
||||||
|
lexes "★₀" [TYPE 0],
|
||||||
|
lexes "★₆₉" [TYPE 69],
|
||||||
|
lexes "★4" [TYPE 4],
|
||||||
|
lexes "Type" [R "★"],
|
||||||
|
lexes "★" [R "★"]
|
||||||
|
]
|
||||||
|
]
|
Loading…
Reference in a new issue