From dbe248be9ac159381db1ec0aeb4936becaab1504 Mon Sep 17 00:00:00 2001 From: rhiannon morris Date: Tue, 28 Feb 2023 20:51:54 +0100 Subject: [PATCH] lexer --- lib/Quox/{Unicode.idr => CharExtra.idr} | 19 +-- lib/Quox/Lexer.idr | 183 ++++++++++++++++++++++++ lib/Quox/Name.idr | 9 +- lib/quox-lib.ipkg | 5 +- tests/Tests.idr | 4 +- tests/Tests/Lexer.idr | 150 +++++++++++++++++++ 6 files changed, 357 insertions(+), 13 deletions(-) rename lib/Quox/{Unicode.idr => CharExtra.idr} (92%) create mode 100644 lib/Quox/Lexer.idr create mode 100644 tests/Tests/Lexer.idr diff --git a/lib/Quox/Unicode.idr b/lib/Quox/CharExtra.idr similarity index 92% rename from lib/Quox/Unicode.idr rename to lib/Quox/CharExtra.idr index 5cd337b..708261f 100644 --- a/lib/Quox/Unicode.idr +++ b/lib/Quox/CharExtra.idr @@ -1,4 +1,4 @@ -module Quox.Unicode +module Quox.CharExtra import Generics.Derive @@ -123,11 +123,13 @@ namespace Char export isIdStart : Char -> Bool -isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9') +isIdStart ch = + ch == '_' || isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9') export isIdCont : Char -> Bool -isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch +isIdCont ch = + isIdStart ch || ch == '\'' || isMark ch || isNumber ch export isIdConnector : Char -> Bool @@ -136,12 +138,11 @@ isIdConnector ch = genCat ch == Punctuation Connector export isSymChar : Char -> Bool -isSymChar ch = - case genCat ch of - Symbol _ => True - Punctuation Dash => True - Punctuation Other => True - _ => False +isSymChar ch = case genCat ch of + Symbol _ => True + Punctuation Dash => True + Punctuation Other => True + _ => False export isWhitespace : Char -> Bool diff --git a/lib/Quox/Lexer.idr b/lib/Quox/Lexer.idr new file mode 100644 index 0000000..2b2cb9f --- /dev/null +++ b/lib/Quox/Lexer.idr @@ -0,0 +1,183 @@ +module Quox.Lexer + +import Quox.CharExtra +import Quox.Name +import Data.String.Extra +import Data.SortedMap +import Text.Lexer +import Text.Lexer.Tokenizer +import Generics.Derive +%hide TT.Name + + +%default total +%language ElabReflection + + +||| @ R reserved token +||| @ I identifier +||| @ N nat literal +||| @ S string literal +||| @ T tag literal +||| @ TYPE "Type" or "★" with subscript +public export +data Token = R String | I Name | N Nat | S String | T String | TYPE Nat +%runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show] + +-- token or whitespace +private +0 TokenW : Type +TokenW = Maybe Token + + +public export +record Error where + constructor Err + reason : StopReason + line, col : Int + ||| `Nothing` if the error is at the end of the input + char : Maybe Char +%runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show] +%runElab derive "Error" [Generic, Meta, Eq, Ord, DecEq, Show] + + +skip : Lexer -> Tokenizer TokenW +skip t = match t $ const Nothing + +export +syntaxChars : List Char +syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.'] + +export +isSymStart, isSymCont : Char -> Bool +isSymStart c = not (c `elem` syntaxChars) && isSymChar c +isSymCont c = c == '\'' || isSymStart c + +export +idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer +idStart = pred isIdStart +idCont = pred isIdCont +idEnd = pred $ \c => c `elem` unpack "?!#" +idContEnd = idCont <|> idEnd +symStart = pred isSymStart +symCont = pred isSymCont + +private %inline +resVal : String -> a -> Maybe Token +resVal str = const $ Just $ R str + +export +resWordL, resSymL, syntaxL : String -> Lexer +resWordL str = exact str <+> reject idContEnd +resSymL str = exact str <+> reject symCont +syntaxL str = exact str + +export +resWord, resSym, syntax : String -> Tokenizer TokenW +resWord str = match (resWordL str) (resVal str) +resSym str = match (resSymL str) (resVal str) +syntax str = match (syntaxL str) (resVal str) + +-- return value contains unicode version +export +uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW +uresWord u a = match (resWordL u <|> resWordL a) (resVal u) +uresSym u a = match (resSymL u <|> resSymL a) (resVal u) +usyntax u a = match (exact u <|> exact a) (resVal u) + +export +alphaName, symName, baseNameL : Lexer +alphaName = idStart <+> many idCont <+> many idEnd +symName = symStart <+> many symCont +baseNameL = symName <|> alphaName + +export +baseName : Tokenizer BaseName +baseName = match baseNameL UN + +private +toName : String -> Name +toName = fromList . split (== '.') + +export +name : Tokenizer TokenW +name = match (baseNameL <+> many (is '.' <+> baseNameL)) + (Just . I . toName . normalizeNfc) + +-- [todo] escapes other than \" and (accidentally) \\ +export +fromStringLit : String -> String +fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where + go : List Char -> List Char + go [] = [] + go ['\\'] = ['\\'] -- i guess??? + go ('\\' :: c :: cs) = c :: go cs + go (c :: cs) = c :: go cs + +export +string : Tokenizer TokenW +string = match stringLit (Just . S . fromStringLit) + +export +nat : Tokenizer TokenW +nat = match (some (range '0' '9')) (Just . N . cast) + +export +tag : Tokenizer TokenW +tag = match (is '`' <+> baseNameL) (Just . T . drop 1) + <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1) + + + +private %inline +fromSub : Char -> Char +fromSub c = case c of + '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4' + '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c + +private %inline +subToNat : String -> Nat +subToNat = cast . pack . map fromSub . unpack + + +export +universe : Tokenizer TokenW +universe = universeWith "★" <|> universeWith "Type" where + universeWith : String -> Tokenizer TokenW + universeWith pfx = + let len = length pfx in + match (exact pfx <+> some (range '0' '9')) + (Just . TYPE . cast . drop len) <|> + match (exact pfx <+> some (range '₀' '₉')) + (Just . TYPE . subToNat . drop len) + + +export +choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a +choice (t :: ts) = foldl (\a, b => a <|> b) t ts + + +export +tokens : Tokenizer TokenW +tokens = choice $ + map skip [pred isWhitespace, + lineComment (exact "--" <+> reject symCont), + blockComment (exact "{-") (exact "-}")] <+> + map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+> + [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+> + map resSym ["@", ":"] <+> + map (uncurry uresSym) + [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+> + map resWord ["case", "case1", "caseω", "return", "of", "_"] <+> + map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+> + [resWord "ω", match (resSymL "#") (resVal "ω"), + universe, resSym "★", match (resWordL "Type") (resVal "★")] <+> + [nat, string, tag, name] + +export +lex : String -> Either Error (List (WithBounds Token)) +lex str = + let (res, reason, line, col, str) = lex tokens str in + case reason of + EndInput => Right $ mapMaybe sequence res + _ => Left $ Err {reason, line, col, char = index 0 str} diff --git a/lib/Quox/Name.idr b/lib/Quox/Name.idr index 5524fa3..3387522 100644 --- a/lib/Quox/Name.idr +++ b/lib/Quox/Name.idr @@ -34,7 +34,7 @@ record Name where constructor MakeName mods : SnocList String base : BaseName -%runElab derive "Name" [Generic, Meta, Eq, Ord] +%runElab derive "Name" [Generic, Meta, Eq, Ord, DecEq] export Show Name where @@ -53,3 +53,10 @@ unq = MakeName [<] export toDots : Name -> String toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base + +export +fromList : List1 String -> Name +fromList (x ::: xs) = go [<] x xs where + go : SnocList String -> String -> List String -> Name + go mods x [] = MakeName mods (UN x) + go mods x (y :: ys) = go (mods :< x) y ys diff --git a/lib/quox-lib.ipkg b/lib/quox-lib.ipkg index 325eebe..454d54d 100644 --- a/lib/quox-lib.ipkg +++ b/lib/quox-lib.ipkg @@ -10,8 +10,8 @@ depends = base, contrib, elab-util, sop, snocvect modules = Quox.NatExtra, Quox.BoolExtra, + Quox.CharExtra, Quox.Decidable, - Quox.Unicode, Quox.No, Quox.Pretty, Quox.Syntax, @@ -34,4 +34,5 @@ modules = Quox.Equal, Quox.Name, Quox.Typing, - Quox.Typechecker + Quox.Typechecker, + Quox.Lexer diff --git a/tests/Tests.idr b/tests/Tests.idr index 5f6eade..aaebd75 100644 --- a/tests/Tests.idr +++ b/tests/Tests.idr @@ -4,6 +4,7 @@ import TAP import Tests.Reduce import Tests.Equal import Tests.Typechecker +import Tests.Lexer import System @@ -11,7 +12,8 @@ allTests : List Test allTests = [ Reduce.tests, Equal.tests, - Typechecker.tests + Typechecker.tests, + Lexer.tests ] main : IO () diff --git a/tests/Tests/Lexer.idr b/tests/Tests/Lexer.idr new file mode 100644 index 0000000..209eab7 --- /dev/null +++ b/tests/Tests/Lexer.idr @@ -0,0 +1,150 @@ +module Tests.Lexer + +import Quox.Name +import Quox.Lexer +import Text.Lexer.Tokenizer +import TAP + + +private +data Failure = LexError Lexer.Error + | WrongLex (List Token) (List Token) + | ExpectedFail (List Token) + +private +ToInfo Failure where + toInfo (LexError err) = + [("type", "LexError"), + ("info", show err)] + toInfo (WrongLex want got) = + [("type", "WrongLex"), + ("want", show want), + ("got", show got)] + toInfo (ExpectedFail got) = + [("type", "ExpectedFail"), + ("got", show got)] + + +private +denewline : String -> String +denewline = pack . map (\case '\n' => '␤'; c => c) . unpack + +private +lexes : String -> List Token -> Test +lexes str toks = test "「\{denewline str}」" {e = Failure} $ + case lex str of + Left err => throwError $ LexError err + Right res => + let res = map val res in + unless (toks == res) $ throwError $ WrongLex toks res + +private +lexFail : String -> Test +lexFail str = test "「\{denewline str}」 # fails" {e = Failure} $ + case lex str of + Left err => pure () + Right res => throwError $ ExpectedFail $ map val res + +export +tests : Test +tests = "lexer" :- [ + "comments" :- [ + lexes "" [], + lexes " " [], + lexes "-- line comment" [], + lexes "name -- line comment" [I "name"], + lexes "-- line comment\nnameBetween -- and another" [I "nameBetween"], + lexes "{- block comment -}" [], + lexes "{- {- nested -} block comment -}" [] + ], + + "identifiers & keywords" :- [ + lexes "abc" [I "abc"], + lexes "abc def" [I "abc", I "def"], + lexes "abc{-comment-}def" [I "abc", I "def"], + lexes "λ" [R "λ"], + lexes "fun" [R "λ"], + lexes "δ" [R "δ"], + lexes "dfun" [R "δ"], + lexes "funky" [I "funky"], + lexes "δελτα" [I "δελτα"], + lexes "a.b.c.d.e" [I $ MakeName [< "a","b","c","d"] "e"], + lexes "normalïse" [I "normalïse"], + -- ↑ replace i + combining ¨ with precomposed ï + + todo "check for reserved words in a qname", + -- lexes "abc.fun.def" [I "abc", R ".", R "λ", R ".", I "def"], + + lexes "+" [I "+"], + lexes "*" [I "*"], + lexes "**" [R "×"], + lexes "***" [I "***"], + lexes "+**" [I "+**"], + lexes "+.+.+" [I $ MakeName [< "+", "+"] "+"], + lexes "a.+" [I $ MakeName [< "a"] "+"], + lexes "+.a" [I $ MakeName [< "+"] "a"], + + lexes "+a" [I "+", I "a"], + + lexes "x." [I "x", R "."], + lexes "&." [I "&", R "."], + lexes ".x" [R ".", I "x"], + lexes "a.b.c." [I $ MakeName [< "a", "b"] "c", R "."], + + lexes "case" [R "case"], + lexes "caseω" [R "caseω"], + lexes "case#" [R "caseω"], + lexes "case1" [R "case1"], + lexes "case0" [I "case0"], + lexes "case##" [I "case##"], + + lexes "_" [R "_"], + lexes "_a" [I "_a"], + lexes "a_" [I "a_"], + + lexes "a'" [I "a'"], + lexes "+'" [I "+'"], + lexFail "'+" + ], + + "syntax characters" :- [ + lexes "()" [R "(", R ")"], + lexes "(a)" [R "(", I "a", R ")"], + lexes "(^)" [R "(", I "^", R ")"], + lexes "`{a,b}" [R "`{", I "a", R ",", I "b", R "}"], + lexes "`{+,-}" [R "`{", I "+", R ",", I "-", R "}"], + lexFail "` {}", + -- [todo] should this be lexed as "`{", "-", "mid", etc? + lexFail "`{-mid token comment-}{}" + ], + + "tags" :- [ + lexes "`a" [T "a"], + lexes "`abc" [T "abc"], + lexes "`+" [T "+"], + lexes "`$$$" [T "$$$"], + lexes #"`"multi word tag""# [T "multi word tag"], + lexFail "`", + lexFail "``" + ], + + "strings" :- [ + lexes #" "" "# [S ""], + lexes #" "abc" "# [S "abc"], + lexes #" "\"" "# [S "\""], + lexes #" "\\" "# [S "\\"], + lexes #" "\\\"" "# [S "\\\""], + todo "other escapes" + ], + + "universes" :- [ + lexes "Type0" [TYPE 0], + lexes "Type₀" [TYPE 0], + lexes "Type9999999" [TYPE 9999999], + lexes "★₀" [TYPE 0], + lexes "★₆₉" [TYPE 69], + lexes "★4" [TYPE 4], + lexes "Type" [R "★"], + lexes "★" [R "★"] + ] +]