-- quox/lib/Quox/Lexer.idr

module Quox.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name


%default total
%language ElabReflection


||| a single lexed token
|||
||| @ Reserved reserved token
||| @ Name name, possibly qualified
||| @ Nat nat literal
||| @ Str string literal
||| @ Tag tag literal
||| @ TYPE "Type" or "★" directly followed by a level, written in ascii or
|||   subscript digits
public export
data Token =
    Reserved String
  | Name Name
  | Nat Nat
  | Str String
  | Tag String
  | TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]

-- token or whitespace: `Just tok` for a real token, `Nothing` for input that
-- is matched but skipped (whitespace and comments)
public export
0 TokenW : Type
TokenW = Maybe Token


-- a lexing failure: why the tokenizer stopped and where
public export
record Error where
  constructor Err
  -- the reason reported by the tokenizer for stopping
  reason    : StopReason
  -- position at which the tokenizer stopped
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char      : Maybe Char
-- `StopReason` is not declared in this file; instances are derived for it
-- here, presumably because the derived instances for `Error` need them
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error"      [Eq, Ord, Show]


||| recognise some input with the given lexer but produce no token for it
private
skip : Lexer -> Tokenizer TokenW
skip lexer = Tokenizer.match lexer (\_ => Nothing)

||| like `Tokenizer.match`, but the function builds a `Token` which is then
||| wrapped in `Just`. the library version is hidden afterwards, so that a
||| plain `match` refers to this one
private
match : Lexer -> (String -> Token) -> Tokenizer TokenW
match lexer mkTok = Tokenizer.match lexer $ \str => Just (mkTok str)
%hide Tokenizer.match


||| punctuation characters that are not allowed to start a symbolic name
export %inline
syntaxChars : List Char
syntaxChars = unpack "()[]{}\"',.;"

-- `isSymStart`: characters that may begin a symbolic name, i.e. those
-- satisfying `isSymChar` which are not listed in `syntaxChars`.
-- `isSymCont`: characters that may continue a symbolic name. this also
-- allows `'`, even though it is one of the `syntaxChars`
private
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont  c = c == '\'' || isSymStart c

||| first character of an alphanumeric identifier
private
idStart : Lexer
idStart = pred isIdStart

||| any later character of an alphanumeric identifier
private
idCont : Lexer
idCont = pred isIdCont

||| one of the marks `?`, `!`, `#` allowed at the very end of an identifier
private
idEnd : Lexer
idEnd = pred (\ch => elem ch (unpack "?!#"))

||| anything that can directly follow the start of an identifier
private
idContEnd : Lexer
idContEnd = idCont <|> idEnd

||| first character of a symbolic name
private
symStart : Lexer
symStart = pred isSymStart

||| any later character of a symbolic name
private
symCont : Lexer
symCont = pred isSymCont

||| a single unqualified name. either an identifier (a start character, any
||| number of continuation characters, then any number of end marks), or a
||| symbolic name (a start character then any number of continuations)
private
baseNameL : Lexer
baseNameL = identL <|> symbolL where
  identL, symbolL : Lexer
  identL  = idStart <+> many idCont <+> many idEnd
  symbolL = symStart <+> many symCont

||| a possibly qualified name: one base name, followed by any number of
||| further base names each introduced by a dot
private
nameL : Lexer
nameL = baseNameL <+> many dotted where
  dotted : Lexer
  dotted = is '.' <+> baseNameL

||| lex a (possibly qualified) name into a `Name` token. the matched text is
||| passed through `normalizeNfc` and then split at the dots
private
name : Tokenizer TokenW
name = match nameL $ \str =>
  Name (fromList (split (== '.') (normalizeNfc str)))

||| turn the source text of a string literal into its contents: the first and
||| last characters (the quotes) are dropped, and every backslash is removed
||| while the character following it is kept unchanged.
||| [todo] escapes other than `\"` and (accidentally) `\\`
export
fromStringLit : String -> String
fromStringLit lit = pack $ unescape $ unpack $ drop 1 $ dropLast 1 lit where
  unescape : List Char -> List Char
  unescape ('\\' :: c :: rest) = c :: unescape rest
  -- also covers a lone backslash at the very end, which is kept
  unescape (c :: rest)         = c :: unescape rest
  unescape []                  = []

||| lex a string literal into a `Str` token holding its unescaped contents
private
string : Tokenizer TokenW
string = match stringLit $ \lit => Str (fromStringLit lit)

||| lex a non-empty run of ascii digits into a `Nat` token
private
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) $ \ds => Nat (cast ds)

||| lex a tag literal: a `'` followed by either a name or a string literal.
||| the leading `'` is not part of the resulting `Tag`
private
tag : Tokenizer TokenW
tag = bare <|> quoted where
  bare, quoted : Tokenizer TokenW
  bare   = match (is '\'' <+> nameL)     $ \str => Tag (drop 1 str)
  quoted = match (is '\'' <+> stringLit) $ \str =>
             Tag (fromStringLit (drop 1 str))


||| map a subscript digit to the corresponding ascii digit. any other
||| character is returned unchanged
private %inline
fromSub : Char -> Char
fromSub '₀' = '0'
fromSub '₁' = '1'
fromSub '₂' = '2'
fromSub '₃' = '3'
fromSub '₄' = '4'
fromSub '₅' = '5'
fromSub '₆' = '6'
fromSub '₇' = '7'
fromSub '₈' = '8'
fromSub '₉' = '9'
fromSub c   = c

||| read a number written in subscript digits, by translating each
||| character with `fromSub` and casting the resulting string
private %inline
subToNat : String -> Nat
subToNat sub = cast (pack (map fromSub (unpack sub)))


||| lex a universe: `★` or `Type` directly followed by a non-empty level,
||| written either in ascii digits or in subscript digits
private
universe : Tokenizer TokenW
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    -- strips the prefix, leaving only the digits of the level
    let level : String -> String := drop (length pfx) in
    match (exact pfx <+> some (range '0' '9'))
          (\str => TYPE (cast (level str))) <|>
    match (exact pfx <+> some (range '₀' '₉'))
          (\str => TYPE (subToNat (level str)))


||| combine a non-empty list of tokenizers into one using `<|>`. the
||| alternatives are nested to the left, in the order they are listed
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = go t ts where
  go : Tokenizer a -> List (Tokenizer a) -> Tokenizer a
  go acc []          = acc
  go acc (alt :: as) = go (acc <|> alt) as


namespace Reserved
  ||| description of a reserved symbol
  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
  ||| @ Sym a reserved symbol (must not be followed by symbolic chars)
  ||| @ Punc a character that doesn't show up in names (brackets, etc)
  public export
  data Reserved1 = Word String | Sym String | Punc Char
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  ||| @ Only a token with a single spelling
  ||| @ Or a token with two spellings. the first one is the representative
  |||   spelling, the second one an alias for it
  public export
  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
  %runElab derive "Reserved" [Eq, Ord, Show]

  -- shorthands for a symbol or a word that has only one spelling
  public export
  Sym1, Word1 : String -> Reserved
  Sym1 = Only . Sym
  Word1 = Only . Word

  ||| shorthand for a punctuation character that has only one spelling
  public export
  Punc1 : Char -> Reserved
  Punc1 = Only . Punc

||| the text of a single reserved token description. for a `Punc` this is
||| the one-character string made of that character
public export
resString1 : Reserved1 -> String
resString1 (Punc x) = singleton x
resString1 (Word w) = w
resString1 (Sym  s) = s

||| return the representative string for a token description. if there are
||| two, then it's the first one, which should be the full-unicode one.
||| the second spelling of an `Or` is ignored here
public export
resString : Reserved -> String
resString (Only r)   = resString1 r
resString (r `Or` _) = resString1 r

||| tokenizer for one spelling of a reserved token. the second argument is
||| the string stored in the resulting `Reserved` token, whatever text was
||| actually matched.
|||
||| a `Word` must not be directly followed by an identifier continuation or
||| end character, and a `Sym` must not be directly followed by a symbol
||| continuation character. a `Punc` has no such restriction
private
resTokenizer1 : Reserved1 -> String -> Tokenizer TokenW
resTokenizer1 (Word w) tok =
  match (exact w <+> reject idContEnd) (\_ => Reserved tok)
resTokenizer1 (Sym s) tok =
  match (exact s <+> reject symCont) (\_ => Reserved tok)
resTokenizer1 (Punc x) tok =
  match (is x) (\_ => Reserved tok)

||| match a reserved token. when the description has two spellings, either
||| one is accepted, but the token always carries the representative (first)
||| spelling
export
resTokenizer : Reserved -> Tokenizer TokenW
resTokenizer res =
  let canon = resString res in
  case res of
    Only r   => resTokenizer1 r canon
    Or r alt => resTokenizer1 r canon <|> resTokenizer1 alt canon

||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
public export
reserved : List Reserved
reserved =
  -- punctuation
  [Punc1 '(', Punc1 ')', Punc1 '[', Punc1 ']', Punc1 '{', Punc1 '}',
   Punc1 ',', Punc1 ';',
   -- symbols, with their ascii spelling second where there is one
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Sym "·" `Or` Punc '.',
   -- words
   Word1 "case",
   Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   Word "ω" `Or` Sym "#",
   -- in `tokens`, the `universe` tokenizer comes before these, so "★" or
   -- "Type" directly followed by digits is not lexed as this entry
   Sym "★" `Or` Word "Type",
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Sym "≔" `Or` Sym ":="]

||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream, i.e. if `str` is the representative string
||| (`resString`) of one of the entries of `reserved`
public export
IsReserved : String -> Type
IsReserved str = str `Elem` map resString reserved

||| the tokenizer for the whole language. the alternatives are, in this
||| order: whitespace and comments (skipped), universes, reserved tokens,
||| and finally nat literals, string literals, tags and names
export
tokens : Tokenizer TokenW
tokens = choice $
  map skip
    [pred isWhitespace,
     lineComment (exact "--" <+> reject symCont),
     blockComment (exact "{-") (exact "-}")] ++
  [universe] ++  -- ★ᵢ takes precedence over bare ★
  map resTokenizer reserved ++
  [nat, string, tag, name]

||| tokenise a whole string with `tokens`. if the tokenizer stops with
||| `EndInput`, the result is the list of tokens with their bounds, with
||| the skipped entries removed. otherwise it is an `Error` holding the stop
||| reason, the position, and the first character of the remaining input
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  case Tokenizer.lex tokens str of
    (toks, EndInput, _) =>
      Right (mapMaybe sequence toks)
    (_, reason, line, col, rest) =>
      Left (Err {reason, line, col, char = index 0 rest})