quox/lib/Quox/Lexer.idr

module Quox.Lexer

import Quox.CharExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import Text.Lexer
import Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name


%default total
%language ElabReflection


||| @ R reserved token
||| @ I identifier
||| @ N nat literal
||| @ S string literal
||| @ T tag literal
||| @ TYPE "Type" or "★" with subscript
public export
data Token = R String | I Name | N Nat | S String | T String | TYPE Nat
%runElab derive "Token" [Eq, Ord, Show]

-- token or whitespace
private
0 TokenW : Type
TokenW = Maybe Token


public export
record Error where
  constructor Err
  reason    : StopReason
  line, col : Int
  ||| `Nothing` if the error is at the end of the input
  char      : Maybe Char
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error"      [Eq, Ord, Show]


skip : Lexer -> Tokenizer TokenW
skip t = match t $ const Nothing

export
syntaxChars : List Char
syntaxChars = ['(', ')', '[', ']', '{', '}', '`', '"', '\'', ',', '.']

export
isSymStart, isSymCont : Char -> Bool
isSymStart c = not (c `elem` syntaxChars) && isSymChar c
isSymCont  c = c == '\'' || isSymStart c

export
idStart, idCont, idEnd, idContEnd, symStart, symCont : Lexer
idStart   = pred isIdStart
idCont    = pred isIdCont
idEnd     = pred $ \c => c `elem` unpack "?!#"
idContEnd = idCont <|> idEnd
symStart  = pred isSymStart
symCont   = pred isSymCont

private %inline
resVal : String -> a -> Maybe Token
resVal str = const $ Just $ R str

export
resWordL, resSymL, syntaxL : String -> Lexer
resWordL str = exact str <+> reject idContEnd
resSymL  str = exact str <+> reject symCont
syntaxL  str = exact str

export
resWord, resSym, syntax : String -> Tokenizer TokenW
resWord str = match (resWordL str) (resVal str)
resSym  str = match (resSymL  str) (resVal str)
syntax  str = match (syntaxL  str) (resVal str)

-- return value contains unicode version
export
uresWord, uresSym, usyntax : (unicode, ascii : String) -> Tokenizer TokenW
uresWord u a = match (resWordL u <|> resWordL a) (resVal u)
uresSym  u a = match (resSymL u <|> resSymL a) (resVal u)
usyntax  u a = match (exact u <|> exact a) (resVal u)

export
alphaName, symName, baseNameL : Lexer
alphaName = idStart <+> many idCont <+> many idEnd
symName   = symStart <+> many symCont
baseNameL = symName <|> alphaName

export
baseName : Tokenizer BaseName
baseName = match baseNameL UN

private
toName : String -> Name
toName = fromList . split (== '.')

export
name : Tokenizer TokenW
name = match (baseNameL <+> many (is '.' <+> baseNameL))
             (Just . I . toName . normalizeNfc)

-- [todo] escapes other than \" and (accidentally) \\
export
fromStringLit : String -> String
fromStringLit = pack . go . unpack . drop 1 . dropLast 1 where
  go : List Char -> List Char
  go []                = []
  go ['\\']            = ['\\'] -- i guess???
  go ('\\' :: c :: cs) = c :: go cs
  go (c :: cs)         = c :: go cs

export
string : Tokenizer TokenW
string = match stringLit (Just . S . fromStringLit)

export
nat : Tokenizer TokenW
nat = match (some (range '0' '9')) (Just . N . cast)

export
tag : Tokenizer TokenW
tag = match (is '`' <+> baseNameL) (Just . T . drop 1)
  <|> match (is '`' <+> stringLit) (Just . T . fromStringLit . drop 1)


private %inline
fromSub : Char -> Char
fromSub c = case c of
  '₀' => '0'; '₁' => '1'; '₂' => '2'; '₃' => '3'; '₄' => '4'
  '₅' => '5'; '₆' => '6'; '₇' => '7'; '₈' => '8'; '₉' => '9'; _ => c

private %inline
subToNat : String -> Nat
subToNat = cast . pack . map fromSub . unpack


export
universe : Tokenizer TokenW
universe = universeWith "★" <|> universeWith "Type" where
  universeWith : String -> Tokenizer TokenW
  universeWith pfx =
    let len = length pfx in
    match (exact pfx <+> some (range '0' '9'))
          (Just . TYPE . cast . drop len) <|>
    match (exact pfx <+> some (range '₀' '₉'))
          (Just . TYPE . subToNat . drop len)


export
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = foldl (\a, b => a <|> b) t ts


export
tokens : Tokenizer TokenW
tokens = choice $
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  map syntax ["(", ")", "[", "]", ",", "{", "}", "."] <+>
  [match (exact "`{" <+> reject (is '-')) (resVal "`{")] <+>
  map resSym ["@", ":"] <+>
  map (uncurry uresSym)
    [("·", "%"), ("→","->"), ("×", "**"), ("≡", "=="), ("∷", "::")] <+>
  map resWord ["case", "case1", "caseω", "return", "of", "_"] <+>
  map (uncurry uresWord) [("λ","fun"), ("δ","dfun"), ("caseω", "case#")] <+>
  [resWord "ω", match (resSymL "#") (resVal "ω"),
   universe, resSym "★", match (resWordL "Type") (resVal "★")] <+>
  [nat, string, tag, name]

export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (res, reason, line, col, str) = lex tokens str in
  case reason of
    EndInput => Right $ mapMaybe sequence res
    _        => Left  $ Err {reason, line, col, char = index 0 str}