lexer

2023-02-28 20:51:54 +01:00 · 2023-02-28 20:51:54 +01:00 · dbe248be9a
commit dbe248be9a
parent cacb3225a2
6 changed files with 357 additions and 13 deletions
--- a/lib/Quox/CharExtra.idr
+++ b/lib/Quox/CharExtra.idr
@ -1,4 +1,4 @@
-module Quox.Unicode
+module Quox.CharExtra
 import Generics.Derive
@ -123,11 +123,13 @@ namespace Char
 export
 isIdStart : Char -> Bool
-isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
+isIdStart ch =
  ch == '_' || isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
 export
 isIdCont : Char -> Bool
-isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch
+isIdCont ch =
  isIdStart ch || ch == '\'' || isMark ch || isNumber ch
 export
 isIdConnector : Char -> Bool
@ -136,8 +138,7 @@ isIdConnector ch = genCat ch == Punctuation Connector
 export
 isSymChar : Char -> Bool
-isSymChar ch =
+isSymChar ch = case genCat ch of
  case genCat ch of
  Symbol      _     => True
  Punctuation Dash  => True
  Punctuation Other => True
--- a/lib/Quox/Lexer.idr
+++ b/lib/Quox/Lexer.idr
@ -0,0 +1,183 @@
 module Quox.Lexer
 import Quox.CharExtra
 import Quox.Name
 import Data.String.Extra
 import Data.SortedMap
 import Text.Lexer
 import Text.Lexer.Tokenizer
 import Generics.Derive
 %hide TT.Name
 %default total
 %language ElabReflection
 ||| @ R reserved token
 ||| @ I identifier
 ||| @ N nat literal
 ||| @ S string literal
 ||| @ T tag literal
 ||| @ TYPE "Type" or "★" with subscript
 public export
 data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
 %runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show]
 -- token or whitespace
 private
 0 TokenW : Type
 TokenW = Maybe Token
 public export
 record Error where
  constructor Err
  reason    : StopReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char      : Maybe Char
 %runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show]
 %runElab derive "Error"      [Generic, Meta, Eq, Ord, DecEq, Show]
 skip : Lexer -> Tokenizer TokenW
 skip t = match t $ const Nothing
 export
 syntaxChars : List Char
 syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']
 export
 isSymStart, isSymCont : Char -> Bool
 isSymStart c = not (c `elem` syntaxChars) && isSymChar c
 isSymCont  c = c == '\'' || isSymStart c
 export
 idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
 idStart   = pred isIdStart
 idCont    = pred isIdCont
 idEnd     = pred $ \c => c `elem` unpack "?!#"
 idContEnd = idCont <|> idEnd
 symStart  = pred isSymStart
 symCont   = pred isSymCont
 private %inline
 resVal : String -> a -> Maybe Token
 resVal str = const $ Just $ R str
 export
 resWordL, resSymL, syntaxL : String -> Lexer
 resWordL str = exact str <+> reject idContEnd
 resSymL  str = exact str <+> reject symCont
 syntaxL  str = exact str
 export
 resWord, resSym, syntax : String -> Tokenizer TokenW
 resWord str = match (resWordL str) (resVal str)
 resSym  str = match (resSymL  str) (resVal str)
 syntax  str = match (syntaxL  str) (resVal str)
 -- return value contains unicode version
 export
 uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
 uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
 uresSym  u a = match (resSymL u <|> resSymL a) (resVal u)
 usyntax  u a = match (exact u <|> exact a) (resVal u)
 export
 alphaName, symName, baseNameL : Lexer
 alphaName = idStart <+> many idCont <+> many idEnd
 symName   = symStart <+> many symCont
 baseNameL = symName <|> alphaName
 export
 baseName : Tokenizer BaseName
 baseName = match baseNameL UN
 private
 toName : String -> Name
 toName = fromList . split (== '.')
 export
 name : Tokenizer TokenW
 name = match (baseNameL <+> many (is '.' <+> baseNameL))
             (Just . I . toName . normalizeNfc)
 -- [todo] escapes other than \" and (accidentally) \\
 export
 fromStringLit : String -> String
 fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go []                = []
  go ['\\']            = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs)         = c :: go cs
 export
 string : Tokenizer TokenW
 string = match stringLit (Just . S . fromStringLit)
 export
 nat : Tokenizer TokenW
 nat = match (some (range '0' '9')) (Just . N . cast)
 export
 tag : Tokenizer TokenW
 tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
  <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
 private %inline
 fromSub : Char -> Char
 fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c
 private %inline
 subToNat : String -> Nat
 subToNat = cast . pack . map fromSub . unpack
 export
 universe : Tokenizer TokenW
 universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    let len = length pfx in
    match (exact pfx <+> some (range '0' '9'))
          (Just . TYPE . cast . drop len) <|>
    match (exact pfx <+> some (range '₀' '₉'))
          (Just . TYPE . subToNat . drop len)
 export
 choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
 choice (t :: ts) = foldl (\a, b => a <|> b) t ts
 export
 tokens : Tokenizer TokenW
 tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
  [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
  map resSym ["@", ":"] <+>
  map (uncurry uresSym)
    [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+>
  map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
  map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
  [resWord "ω", match (resSymL "#") (resVal "ω"),
   universe, resSym "★", match (resWordL "Type") (resVal "★")] <+>
  [nat, string, tag, name]
 export
 lex : String -> Either Error (List (WithBounds Token))
 lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case reason of
    EndInput => Right $ mapMaybe sequence res
    _        => Left  $ Err {reason, line, col, char = index 0 str}
--- a/lib/Quox/Name.idr
+++ b/lib/Quox/Name.idr
@ -34,7 +34,7 @@ record Name where
  constructor MakeName
  mods : SnocList String
  base : BaseName
-%runElab derive "Name" [Generic, Meta, Eq, Ord]
+%runElab derive "Name" [Generic, Meta, Eq, Ord, DecEq]
 export
 Show Name where
@ -53,3 +53,10 @@ unq = MakeName [<]
 export
 toDots : Name -> String
 toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base
 export
 fromList : List1 String -> Name
 fromList (x ::: xs) = go [<] x xs where
  go : SnocList String -> String -> List String -> Name
  go mods x []        = MakeName mods (UN x)
  go mods x (y :: ys) = go (mods :< x) y ys
--- a/lib/quox-lib.ipkg
+++ b/lib/quox-lib.ipkg
@ -10,8 +10,8 @@ depends = base, contrib, elab-util, sop, snocvect
 modules =
  Quox.NatExtra,
  Quox.BoolExtra,
  Quox.CharExtra,
  Quox.Decidable,
  Quox.Unicode,
  Quox.No,
  Quox.Pretty,
  Quox.Syntax,
@ -34,4 +34,5 @@ modules =
  Quox.Equal,
  Quox.Name,
  Quox.Typing,
-  Quox.Typechecker
+  Quox.Typechecker,
  Quox.Lexer
--- a/tests/Tests.idr
+++ b/tests/Tests.idr
@ -4,6 +4,7 @@ import TAP
 import Tests.Reduce
 import Tests.Equal
 import Tests.Typechecker
 import Tests.Lexer
 import System
@ -11,7 +12,8 @@ allTests : List Test
 allTests = [
  Reduce.tests,
  Equal.tests,
-  Typechecker.tests
+  Typechecker.tests,
  Lexer.tests
 ]
 main : IO ()
--- a/tests/Tests/Lexer.idr
+++ b/tests/Tests/Lexer.idr
@ -0,0 +1,150 @@
 module Tests.Lexer
 import Quox.Name
 import Quox.Lexer
 import Text.Lexer.Tokenizer
 import TAP
 private
 data Failure = LexError Lexer.Error
             | WrongLex (List Token) (List Token)
             | ExpectedFail (List Token)
 private
 ToInfo Failure where
  toInfo (LexError err) =
    [("type", "LexError"),
     ("info", show err)]
  toInfo (WrongLex want got) =
    [("type", "WrongLex"),
     ("want", show want),
     ("got",  show got)]
  toInfo (ExpectedFail got) =
    [("type", "ExpectedFail"),
     ("got",  show got)]
 private
 denewline : String -> String
 denewline = pack . map (\case '\n' => '␤'; c => c) . unpack
 private
 lexes : String -> List Token -> Test
 lexes str toks = test "｢\{denewline str}｣" {e = Failure} $
  case lex str of
    Left  err => throwError $ LexError err
    Right res =>
      let res = map val res in
      unless (toks == res) $ throwError $ WrongLex toks res
 private
 lexFail : String -> Test
 lexFail str = test "｢\{denewline str}｣   # fails" {e = Failure} $
  case lex str of
    Left  err => pure ()
    Right res => throwError $ ExpectedFail $ map val res
 export
 tests : Test
 tests = "lexer" :- [
  "comments" :- [
    lexes "" [],
    lexes "    " [],
    lexes "-- line comment" [],
    lexes "name -- line comment" [I "name"],
    lexes "-- line comment\nnameBetween -- and another" [I "nameBetween"],
    lexes "{- block comment -}" [],
    lexes "{- {- nested -} block comment -}" []
  ],
  "identifiers & keywords" :- [
    lexes "abc" [I "abc"],
    lexes "abc def" [I "abc", I "def"],
    lexes "abc{-comment-}def" [I "abc", I "def"],
    lexes "λ" [R "λ"],
    lexes "fun" [R "λ"],
    lexes "δ" [R "δ"],
    lexes "dfun" [R "δ"],
    lexes "funky" [I "funky"],
    lexes "δελτα" [I "δελτα"],
    lexes "a.b.c.d.e" [I $ MakeName [< "a","b","c","d"] "e"],
    lexes "normalïse" [I "normalïse"],
      -- ↑ replace i + combining ¨ with precomposed ï
    todo "check for reserved words in a qname",
    -- lexes "abc.fun.def" [I "abc", R ".", R "λ", R ".", I "def"],
    lexes "+" [I "+"],
    lexes "*" [I "*"],
    lexes "**" [R "×"],
    lexes "***" [I "***"],
    lexes "+**" [I "+**"],
    lexes "+.+.+" [I $ MakeName [< "+", "+"] "+"],
    lexes "a.+" [I $ MakeName [< "a"] "+"],
    lexes "+.a" [I $ MakeName [< "+"] "a"],
    lexes "+a" [I "+", I "a"],
    lexes "x." [I "x", R "."],
    lexes "&." [I "&", R "."],
    lexes ".x" [R ".", I "x"],
    lexes "a.b.c." [I $ MakeName [< "a", "b"] "c", R "."],
    lexes "case" [R "case"],
    lexes "caseω" [R "caseω"],
    lexes "case#" [R "caseω"],
    lexes "case1" [R "case1"],
    lexes "case0" [I "case0"],
    lexes "case##" [I "case##"],
    lexes "_" [R "_"],
    lexes "_a" [I "_a"],
    lexes "a_" [I "a_"],
    lexes "a'" [I "a'"],
    lexes "+'" [I "+'"],
    lexFail "'+"
  ],
  "syntax characters" :- [
    lexes "()" [R "(", R ")"],
    lexes "(a)" [R "(", I "a", R ")"],
    lexes "(^)" [R "(", I "^", R ")"],
    lexes "`{a,b}" [R "`{", I "a", R ",", I "b", R "}"],
    lexes "`{+,-}" [R "`{", I "+", R ",", I "-", R "}"],
    lexFail "` {}",
    -- [todo] should this be lexed as "`{", "-", "mid", etc?
    lexFail "`{-mid token comment-}{}"
  ],
  "tags" :- [
    lexes "`a" [T "a"],
    lexes "`abc" [T "abc"],
    lexes "`+" [T "+"],
    lexes "`$$$" [T "$$$"],
    lexes #"`"multi word tag""# [T "multi word tag"],
    lexFail "`",
    lexFail "``"
  ],
  "strings" :- [
    lexes #" "" "# [S ""],
    lexes #" "abc" "# [S "abc"],
    lexes #" "\"" "# [S "\""],
    lexes #" "\\" "# [S "\\"],
    lexes #" "\\\"" "# [S "\\\""],
    todo "other escapes"
  ],
  "universes" :- [
    lexes "Type0" [TYPE 0],
    lexes "Type₀" [TYPE 0],
    lexes "Type9999999" [TYPE 9999999],
    lexes "★₀" [TYPE 0],
    lexes "★₆₉" [TYPE 69],
    lexes "★4" [TYPE 4],
    lexes "Type" [R "★"],
    lexes "★" [R "★"]
  ]
 ]