From dbe248be9ac159381db1ec0aeb4936becaab1504 Mon Sep 17 00:00:00 2001
From: rhiannon morris <rhi@rhiannon.website>
Date: Tue, 28 Feb 2023 20:51:54 +0100
Subject: [PATCH] lexer

---
 lib/Quox/{Unicode.idr => CharExtra.idr} |  19 +--
 lib/Quox/Lexer.idr                      | 183 ++++++++++++++++++++++++
 lib/Quox/Name.idr                       |   9 +-
 lib/quox-lib.ipkg                       |   5 +-
 tests/Tests.idr                         |   4 +-
 tests/Tests/Lexer.idr                   | 150 +++++++++++++++++++
 6 files changed, 357 insertions(+), 13 deletions(-)
 rename lib/Quox/{Unicode.idr => CharExtra.idr} (92%)
 create mode 100644 lib/Quox/Lexer.idr
 create mode 100644 tests/Tests/Lexer.idr

diff --git a/lib/Quox/Unicode.idr b/lib/Quox/CharExtra.idr
similarity index 92%
rename from lib/Quox/Unicode.idr
rename to lib/Quox/CharExtra.idr
index 5cd337b..708261f 100644
--- a/lib/Quox/Unicode.idr
+++ b/lib/Quox/CharExtra.idr
@@ -1,4 +1,4 @@
-module Quox.Unicode
+module Quox.CharExtra
 
 import Generics.Derive
 
@@ -123,11 +123,13 @@ namespace Char
 
 export
 isIdStart : Char -> Bool
-isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
+isIdStart ch =
+  ch == '_' || isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
 
 export
 isIdCont : Char -> Bool
-isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch
+isIdCont ch =
+  isIdStart ch || ch == '\'' || isMark ch || isNumber ch
 
 export
 isIdConnector : Char -> Bool
@@ -136,12 +138,11 @@ isIdConnector ch = genCat ch == Punctuation Connector
 
 export
 isSymChar : Char -> Bool
-isSymChar ch =
-  case genCat ch of
-    Symbol      _     => True
-    Punctuation Dash  => True
-    Punctuation Other => True
-    _                 => False
+isSymChar ch = case genCat ch of
+  Symbol      _     => True
+  Punctuation Dash  => True
+  Punctuation Other => True
+  _                 => False
 
 export
 isWhitespace : Char -> Bool
diff --git a/lib/Quox/Lexer.idr b/lib/Quox/Lexer.idr
new file mode 100644
index 0000000..2b2cb9f
--- /dev/null
+++ b/lib/Quox/Lexer.idr
@@ -0,0 +1,183 @@
+module Quox.Lexer
+
+import Quox.CharExtra
+import Quox.Name
+import Data.String.Extra
+import Data.SortedMap
+import Text.Lexer
+import Text.Lexer.Tokenizer
+import Generics.Derive
+%hide TT.Name
+
+
+%default total
+%language ElabReflection
+
+
+||| @ R reserved token
+||| @ I identifier
+||| @ N nat literal
+||| @ S string literal
+||| @ T tag literal
+||| @ TYPE "Type" or "★" with subscript
+public export
+data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
+%runElab derive "Token" [Generic, Meta, Eq, Ord, DecEq, Show]
+
+-- token or whitespace
+private
+0 TokenW : Type
+TokenW = Maybe Token
+
+
+public export
+record Error where
+  constructor Err
+  reason    : StopReason
+  line, col : Int
+  ||| `Nothing` if the error is at the end of the input
+  char      : Maybe Char
+%runElab derive "StopReason" [Generic, Meta, Eq, Ord, DecEq, Show]
+%runElab derive "Error"      [Generic, Meta, Eq, Ord, DecEq, Show]
+
+
+skip : Lexer -> Tokenizer TokenW
+skip t = match t $ const Nothing
+
+export
+syntaxChars : List Char
+syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']
+
+export
+isSymStart, isSymCont : Char -> Bool
+isSymStart c = not (c `elem` syntaxChars) && isSymChar c
+isSymCont  c = c == '\'' || isSymStart c
+
+export
+idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
+idStart   = pred isIdStart
+idCont    = pred isIdCont
+idEnd     = pred $ \c => c `elem` unpack "?!#"
+idContEnd = idCont <|> idEnd
+symStart  = pred isSymStart
+symCont   = pred isSymCont
+
+private %inline
+resVal : String -> a -> Maybe Token
+resVal str = const $ Just $ R str
+
+export
+resWordL, resSymL, syntaxL : String -> Lexer
+resWordL str = exact str <+> reject idContEnd
+resSymL  str = exact str <+> reject symCont
+syntaxL  str = exact str
+
+export
+resWord, resSym, syntax : String -> Tokenizer TokenW
+resWord str = match (resWordL str) (resVal str)
+resSym  str = match (resSymL  str) (resVal str)
+syntax  str = match (syntaxL  str) (resVal str)
+
+-- return value contains unicode version
+export
+uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
+uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
+uresSym  u a = match (resSymL u <|> resSymL a) (resVal u)
+usyntax  u a = match (exact u <|> exact a) (resVal u)
+
+export
+alphaName, symName, baseNameL : Lexer
+alphaName = idStart <+> many idCont <+> many idEnd
+symName   = symStart <+> many symCont
+baseNameL = symName <|> alphaName
+
+export
+baseName : Tokenizer BaseName
+baseName = match baseNameL UN
+
+private
+toName : String -> Name
+toName = fromList . split (== '.')
+
+export
+name : Tokenizer TokenW
+name = match (baseNameL <+> many (is '.' <+> baseNameL))
+             (Just . I . toName . normalizeNfc)
+
+-- [todo] escapes other than \" and (accidentally) \\
+export
+fromStringLit : String -> String
+fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
+  go : List Char -> List Char
+  go []                = []
+  go ['\\']            = ['\\'] -- i guess???
+  go ('\\' :: c :: cs) = c :: go cs
+  go (c :: cs)         = c :: go cs
+
+export
+string : Tokenizer TokenW
+string = match stringLit (Just . S . fromStringLit)
+
+export
+nat : Tokenizer TokenW
+nat = match (some (range '0' '9')) (Just . N . cast)
+
+export
+tag : Tokenizer TokenW
+tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
+  <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)
+
+
+
+private %inline
+fromSub : Char -> Char
+fromSub c = case c of
+  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
+  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c
+
+private %inline
+subToNat : String -> Nat
+subToNat = cast . pack . map fromSub . unpack
+
+
+export
+universe : Tokenizer TokenW
+universe = universeWith "★" <|> universeWith "Type" where
+  universeWith : String -> Tokenizer TokenW
+  universeWith pfx =
+    let len = length pfx in
+    match (exact pfx <+> some (range '0' '9'))
+          (Just . TYPE . cast . drop len) <|>
+    match (exact pfx <+> some (range '₀' '₉'))
+          (Just . TYPE . subToNat . drop len)
+
+
+export
+choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
+choice (t :: ts) = foldl (\a, b => a <|> b) t ts
+
+
+export
+tokens : Tokenizer TokenW
+tokens = choice $
+  map skip [pred isWhitespace,
+            lineComment (exact "--" <+> reject symCont),
+            blockComment (exact "{-") (exact "-}")] <+>
+  map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
+  [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
+  map resSym ["@", ":"] <+>
+  map (uncurry uresSym)
+    [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+>
+  map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
+  map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
+  [resWord "ω", match (resSymL "#") (resVal "ω"),
+   universe, resSym "★", match (resWordL "Type") (resVal "★")] <+>
+  [nat, string, tag, name]
+
+export
+lex : String -> Either Error (List (WithBounds Token))
+lex str =
+  let (res, reason, line, col, str) = lex tokens str in
+  case reason of
+    EndInput => Right $ mapMaybe sequence res
+    _        => Left  $ Err {reason, line, col, char = index 0 str}
diff --git a/lib/Quox/Name.idr b/lib/Quox/Name.idr
index 5524fa3..3387522 100644
--- a/lib/Quox/Name.idr
+++ b/lib/Quox/Name.idr
@@ -34,7 +34,7 @@ record Name where
   constructor MakeName
   mods : SnocList String
   base : BaseName
-%runElab derive "Name" [Generic, Meta, Eq, Ord]
+%runElab derive "Name" [Generic, Meta, Eq, Ord, DecEq]
 
 export
 Show Name where
@@ -53,3 +53,10 @@ unq = MakeName [<]
 export
 toDots : Name -> String
 toDots x = fastConcat $ cast $ map (<+> ".") x.mods :< baseStr x.base
+
+export
+fromList : List1 String -> Name
+fromList (x ::: xs) = go [<] x xs where
+  go : SnocList String -> String -> List String -> Name
+  go mods x []        = MakeName mods (UN x)
+  go mods x (y :: ys) = go (mods :< x) y ys
diff --git a/lib/quox-lib.ipkg b/lib/quox-lib.ipkg
index 325eebe..454d54d 100644
--- a/lib/quox-lib.ipkg
+++ b/lib/quox-lib.ipkg
@@ -10,8 +10,8 @@ depends = base, contrib, elab-util, sop, snocvect
 modules =
   Quox.NatExtra,
   Quox.BoolExtra,
+  Quox.CharExtra,
   Quox.Decidable,
-  Quox.Unicode,
   Quox.No,
   Quox.Pretty,
   Quox.Syntax,
@@ -34,4 +34,5 @@ modules =
   Quox.Equal,
   Quox.Name,
   Quox.Typing,
-  Quox.Typechecker
+  Quox.Typechecker,
+  Quox.Lexer
diff --git a/tests/Tests.idr b/tests/Tests.idr
index 5f6eade..aaebd75 100644
--- a/tests/Tests.idr
+++ b/tests/Tests.idr
@@ -4,6 +4,7 @@ import TAP
 import Tests.Reduce
 import Tests.Equal
 import Tests.Typechecker
+import Tests.Lexer
 import System
 
 
@@ -11,7 +12,8 @@ allTests : List Test
 allTests = [
   Reduce.tests,
   Equal.tests,
-  Typechecker.tests
+  Typechecker.tests,
+  Lexer.tests
 ]
 
 main : IO ()
diff --git a/tests/Tests/Lexer.idr b/tests/Tests/Lexer.idr
new file mode 100644
index 0000000..209eab7
--- /dev/null
+++ b/tests/Tests/Lexer.idr
@@ -0,0 +1,150 @@
+module Tests.Lexer
+
+import Quox.Name
+import Quox.Lexer
+import Text.Lexer.Tokenizer
+import TAP
+
+
+private
+data Failure = LexError Lexer.Error
+             | WrongLex (List Token) (List Token)
+             | ExpectedFail (List Token)
+
+private
+ToInfo Failure where
+  toInfo (LexError err) =
+    [("type", "LexError"),
+     ("info", show err)]
+  toInfo (WrongLex want got) =
+    [("type", "WrongLex"),
+     ("want", show want),
+     ("got",  show got)]
+  toInfo (ExpectedFail got) =
+    [("type", "ExpectedFail"),
+     ("got",  show got)]
+
+
+private
+denewline : String -> String
+denewline = pack . map (\case '\n' => '␤'; c => c) . unpack
+
+private
+lexes : String -> List Token -> Test
+lexes str toks = test "｢\{denewline str}｣" {e = Failure} $
+  case lex str of
+    Left  err => throwError $ LexError err
+    Right res =>
+      let res = map val res in
+      unless (toks == res) $ throwError $ WrongLex toks res
+
+private
+lexFail : String -> Test
+lexFail str = test "｢\{denewline str}｣   # fails" {e = Failure} $
+  case lex str of
+    Left  err => pure ()
+    Right res => throwError $ ExpectedFail $ map val res
+
+export
+tests : Test
+tests = "lexer" :- [
+  "comments" :- [
+    lexes "" [],
+    lexes "    " [],
+    lexes "-- line comment" [],
+    lexes "name -- line comment" [I "name"],
+    lexes "-- line comment\nnameBetween -- and another" [I "nameBetween"],
+    lexes "{- block comment -}" [],
+    lexes "{- {- nested -} block comment -}" []
+  ],
+
+  "identifiers & keywords" :- [
+    lexes "abc" [I "abc"],
+    lexes "abc def" [I "abc", I "def"],
+    lexes "abc{-comment-}def" [I "abc", I "def"],
+    lexes "λ" [R "λ"],
+    lexes "fun" [R "λ"],
+    lexes "δ" [R "δ"],
+    lexes "dfun" [R "δ"],
+    lexes "funky" [I "funky"],
+    lexes "δελτα" [I "δελτα"],
+    lexes "a.b.c.d.e" [I $ MakeName [< "a","b","c","d"] "e"],
+    lexes "normalïse" [I "normalïse"],
+      -- ↑ replace i + combining ¨ with precomposed ï
+
+    todo "check for reserved words in a qname",
+    -- lexes "abc.fun.def" [I "abc", R ".", R "λ", R ".", I "def"],
+
+    lexes "+" [I "+"],
+    lexes "*" [I "*"],
+    lexes "**" [R "×"],
+    lexes "***" [I "***"],
+    lexes "+**" [I "+**"],
+    lexes "+.+.+" [I $ MakeName [< "+", "+"] "+"],
+    lexes "a.+" [I $ MakeName [< "a"] "+"],
+    lexes "+.a" [I $ MakeName [< "+"] "a"],
+
+    lexes "+a" [I "+", I "a"],
+
+    lexes "x." [I "x", R "."],
+    lexes "&." [I "&", R "."],
+    lexes ".x" [R ".", I "x"],
+    lexes "a.b.c." [I $ MakeName [< "a", "b"] "c", R "."],
+
+    lexes "case" [R "case"],
+    lexes "caseω" [R "caseω"],
+    lexes "case#" [R "caseω"],
+    lexes "case1" [R "case1"],
+    lexes "case0" [I "case0"],
+    lexes "case##" [I "case##"],
+
+    lexes "_" [R "_"],
+    lexes "_a" [I "_a"],
+    lexes "a_" [I "a_"],
+
+    lexes "a'" [I "a'"],
+    lexes "+'" [I "+'"],
+    lexFail "'+"
+  ],
+
+  "syntax characters" :- [
+    lexes "()" [R "(", R ")"],
+    lexes "(a)" [R "(", I "a", R ")"],
+    lexes "(^)" [R "(", I "^", R ")"],
+    lexes "`{a,b}" [R "`{", I "a", R ",", I "b", R "}"],
+    lexes "`{+,-}" [R "`{", I "+", R ",", I "-", R "}"],
+    lexFail "` {}",
+    -- [todo] should this be lexed as "`{", "-", "mid", etc?
+    lexFail "`{-mid token comment-}{}"
+  ],
+
+  "tags" :- [
+    lexes "`a" [T "a"],
+    lexes "`abc" [T "abc"],
+    lexes "`+" [T "+"],
+    lexes "`$$$" [T "$$$"],
+    lexes #"`"multi word tag""# [T "multi word tag"],
+    lexFail "`",
+    lexFail "``"
+  ],
+
+  "strings" :- [
+    lexes #" "" "# [S ""],
+    lexes #" "abc" "# [S "abc"],
+    lexes #" "\"" "# [S "\""],
+    lexes #" "\\" "# [S "\\"],
+    lexes #" "\\\"" "# [S "\\\""],
+    todo "other escapes"
+  ],
+
+  "universes" :- [
+    lexes "Type0" [TYPE 0],
+    lexes "Type₀" [TYPE 0],
+    lexes "Type9999999" [TYPE 9999999],
+    lexes "★₀" [TYPE 0],
+    lexes "★₆₉" [TYPE 69],
+    lexes "★4" [TYPE 4],
+    lexes "Type" [R "★"],
+    lexes "★" [R "★"]
+  ]
+]