quox/lib/Quox/Parser/Lexer.idr

module Quox.Parser.Lexer

import Quox.CharExtra
import Quox.NatExtra
import Quox.Name
import Data.String.Extra
import Data.SortedMap
import public Data.String -- for singleton to reduce in IsReserved
import public Data.List.Elem
import public Text.Lexer
import public Text.Lexer.Tokenizer
import Derive.Prelude
%hide TT.Name


%default total
%language ElabReflection


||| a well formed lexical token
public export
data Token : Type where
  ||| reserved token
  Reserved : String -> Token
  ||| name, possibly qualified
  Name     : PName -> Token
  ||| nat literal
  Nat      : Nat -> Token
  ||| string literal
  Str      : String -> Token
  ||| tag literal
  Tag      : String -> Token
  ||| "Type" or "★" with ascii nat directly after
  TYPE     : Nat -> Token
  ||| superscript or ^ number (displacement, or universe for ★)
  Sup      : Nat -> Token
%runElab derive "Token" [Eq, Ord, Show]

||| token or whitespace
public export
data ExtToken : Type where
  ||| whitespace, comments, etc
  Skip    : ExtToken
  ||| a token which failed a post-lexer check
  ||| (e.g. a qualified name containing a keyword).
  ||| the first field is the error message, the second the offending text
  Invalid : String -> String -> ExtToken
  ||| a well formed token
  T       : Token -> ExtToken
%runElab derive "ExtToken" [Eq, Ord, Show]


||| why lexing failed
public export
data ErrorReason : Type where
  ||| copied from the `StopReason` constructor of the same name
  NoRuleApply       : ErrorReason
  ||| copied, with both of its fields, from the `StopReason` constructor
  ||| of the same name
  ComposeNotClosing : (Int, Int) -> (Int, Int) -> ErrorReason
  ||| any other failure, described by a message
  ||| (used for tokens rejected as `Invalid`)
  Other             : String -> ErrorReason
%runElab derive "ErrorReason" [Eq, Ord, Show]

||| a lexical error together with the place it was reported at
public export
record Error where
  constructor Err
  ||| what went wrong
  reason : ErrorReason
  line   : Int
  col    : Int
  ||| `Nothing` if the error is at the end of the input
  char   : Maybe Char
-- `StopReason` is not declared in this file (presumably it comes from the
-- tokenizer library); its instances are derived here as well
%runElab derive "StopReason" [Eq, Ord, Show]
%runElab derive "Error"      [Eq, Ord, Show]


||| tokenizer which recognises `lexer` and always yields `Skip`,
||| ignoring the matched text
private
skip : Lexer -> Tokenizer ExtToken
skip lexer = match lexer (\_ => Skip)

||| like `match`, but for rules which cannot fail: the matched text is turned
||| into a `Token` by `mk` and the result is wrapped in `T`
private
tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
tmatch lexer mk = match lexer $ \str => T (mk str)


||| turn the raw text of a string literal into a token, processing
||| backslash escapes.
|||
||| the first and last characters of `str` (the delimiters) are dropped
||| before unescaping. recognised escapes are `\n`, `\t`, `\\`, `\"`, and
||| `\x…;` / `\X…;` (hex digits terminated by a semicolon).
|||
||| @ f   builds the token from the unescaped contents
||| @ str the literal exactly as it was matched
|||
||| the result is `Invalid` with a message (and the original `str`)
||| if any escape is malformed
export
fromStringLit : (String -> Token) -> String -> ExtToken
fromStringLit f str =
  either (\err => Invalid err str) (\ok => T (f (pack ok))) $
  unescape $ unpack $ drop 1 $ dropLast 1 str
where
  -- lets a `Char` be spliced into the error messages below
  Interpolation Char where interpolate = singleton

  unescape  : List Char -> Either String (List Char)
  hexEscape : List Char -> Either String (List Char)

  -- clause order matters: the specific escapes must come before the
  -- catch-all "unknown escape" and plain-character cases
  unescape []                   = Right []
  unescape ['\\']               = Left "string ends with \\"
  unescape ('\\' :: 'n'  :: cs) = map ('\n' ::) (unescape cs)
  unescape ('\\' :: 't'  :: cs) = map ('\t' ::) (unescape cs)
  unescape ('\\' :: 'x'  :: cs) = hexEscape cs
  unescape ('\\' :: 'X'  :: cs) = hexEscape cs
  unescape ('\\' :: '\\' :: cs) = map ('\\' ::) (unescape cs)
  unescape ('\\' :: '"'  :: cs) = map ('"'  ::) (unescape cs)
  -- [todo] others
  unescape ('\\' :: c :: _)     = Left "unknown escape '\{c}'"
  unescape (c :: cs)            = map (c ::) (unescape cs)

  -- `cs` is everything after the `\x`; the digits run up to the first `;`
  hexEscape cs =
    case break (== ';') cs of
      (hexChars, ';' :: rest) =>
        let hs = pack hexChars in
        case Int.fromHex hs of
          Nothing => Left #"invalid hex string "\#{hs}" in escape"#
          Just c  =>
            if isCodepoint c
               -- `rest` is a suffix of `cs`, which the totality checker
               -- cannot see through `break`
               then map (chr c ::) $ unescape (assert_smaller cs rest)
               else Left "codepoint \{hs} out of range"
      _ => Left "unterminated hex escape"

||| tokenizer for a string literal; the matched text is converted to a `Str`
||| token (or `Invalid`) by `fromStringLit`
private
string : Tokenizer ExtToken
string = match stringLit (fromStringLit Str)


%hide binLit
%hide octLit
%hide hexLit

||| tokenizer for natural number literals. the hexadecimal form (`0x…`) is
||| tried first, then the decimal form. in both, consecutive digits may be
||| separated by a single underscore, which is discarded
private
nat : Tokenizer ExtToken
nat = match hexLit fromHexLit <|> tmatch decLit fromDecLit
where
  -- one `digit`, then any number of further ones, each optionally
  -- preceded by an underscore
  withUnderscores : Lexer -> Lexer
  withUnderscores digit = digit <+> many (opt (is '_') <+> digit)

  -- remove every underscore from the matched text
  withoutUnderscores : String -> String
  withoutUnderscores str = pack $ filter (/= '_') $ unpack str

  decLit =
    withUnderscores (range '0' '9') <+> reject idContEnd

  hexLit =
    approx "0x" <+>
    withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
    reject idContEnd

  fromDecLit : String -> Token
  fromDecLit str = Nat $ cast $ withoutUnderscores str

  -- the first two characters (the `0x` prefix) are dropped before conversion
  fromHexLit : String -> ExtToken
  fromHexLit str =
    let hexDigits = withoutUnderscores (drop 2 str) in
    maybe (Invalid "invalid hex sequence" str) (T . Nat) (fromHex hexDigits)


||| tokenizer for tag literals: an apostrophe followed by either a name or a
||| string literal. the apostrophe is dropped from the matched text before
||| the `Tag` token is built
private
tag : Tokenizer ExtToken
tag = tmatch (is '\'' <+> name)      (\str => Tag (drop 1 str))
  <|>  match (is '\'' <+> stringLit) (\str => fromStringLit Tag (drop 1 str))


||| map a superscript digit to the corresponding ascii digit;
||| every other character is returned unchanged
private %inline
fromSup : Char -> Char
fromSup '⁰' = '0'
fromSup '¹' = '1'
fromSup '²' = '2'
fromSup '³' = '3'
fromSup '⁴' = '4'
fromSup '⁵' = '5'
fromSup '⁶' = '6'
fromSup '⁷' = '7'
fromSup '⁸' = '8'
fromSup '⁹' = '9'
fromSup c   = c

||| read a string of superscript digits as a natural number, by first
||| replacing each character with `fromSup` and then casting the result
private %inline
supToNat : String -> Nat
supToNat str = cast (pack (map fromSup (unpack str)))

-- ★0, Type0. base ★/Type is a Reserved and ★¹/Type¹ are sequences of two tokens
||| tokenizer for a universe written with its level directly attached in
||| ascii digits, after either the `★` or the `Type` prefix
private
universe : Tokenizer ExtToken
universe = universeWith "★" <|> universeWith "Type" where
  -- NOTE(review): unlike `decLit` in `nat`, the digits here are not followed
  -- by `reject idContEnd` — confirm that input like `Type0x` is meant to
  -- start with a `TYPE` token
  universeWith : String -> Tokenizer ExtToken
  universeWith pfx =
    let pfxLen = length pfx in
    -- drop the prefix and read what is left as the level
    tmatch (exact pfx <+> digits) $ \str => TYPE (cast (drop pfxLen str))

||| tokenizer for a `Sup` token, written either as a run of superscript
||| digits or as `^` followed by ascii digits
private
sup : Tokenizer ExtToken
sup = tmatch (some (pred isSupDigit)) (\str => Sup (supToNat str))
  <|> tmatch (is '^' <+> digits)      (\str => Sup (cast (drop 1 str)))


||| combine a non-empty list of tokenizers with `<|>`, nested to the left,
||| keeping the alternatives in list order
private %inline
choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
choice (t :: ts) = go t ts where
  -- left fold: `acc` is the alternation of everything seen so far
  go : Tokenizer a -> List (Tokenizer a) -> Tokenizer a
  go acc []            = acc
  go acc (alt :: alts) = go (acc <|> alt) alts


namespace Reserved
  ||| description of a reserved symbol
  public export
  data Reserved1 : Type where
    ||| a reserved word (must not be followed by letters, digits, etc)
    Word : String -> Reserved1
    ||| a reserved symbol (must not be followed by symbolic chars)
    Sym  : String -> Reserved1
    ||| a character that doesn't show up in names (brackets, etc);
    ||| also a sequence ending in one of those, like `#[`, since the
    ||| difference relates to lookahead
    Punc : String -> Reserved1
  %runElab derive "Reserved1" [Eq, Ord, Show]

  ||| description of a token that might have unicode & ascii-only aliases
  public export
  data Reserved : Type where
    ||| a token with a single spelling
    Only : Reserved1 -> Reserved
    ||| a token with two spellings
    Or   : Reserved1 -> Reserved1 -> Reserved
  %runElab derive "Reserved" [Eq, Ord, Show]

  ||| a reserved symbol with a single spelling
  public export
  Sym1 : String -> Reserved
  Sym1 str = Only (Sym str)

  ||| a reserved word with a single spelling
  public export
  Word1 : String -> Reserved
  Word1 str = Only (Word str)

  ||| a punctuation token with a single spelling
  public export
  Punc1 : String -> Reserved
  Punc1 str = Only (Punc str)

||| the text of a single reserved-token description,
||| whichever kind of token it is
public export
resString1 : Reserved1 -> String
resString1 (Word str) = str
resString1 (Sym  str) = str
resString1 (Punc str) = str

||| the representative string of a token description. when the description
||| has two spellings, this is the first, which should be the full-unicode one
public export
resString : Reserved -> String
resString (Only single)  = resString1 single
resString (Or primary _) = resString1 primary

||| every spelling of a token description: one string for `Only`,
||| two (in the order given) for `Or`
public export
resString2 : Reserved -> List String
resString2 (Only single)      = [resString1 single]
resString2 (Or primary alias) = map resString1 [primary, alias]

||| tokenizer for one spelling of a reserved token.
||| @ r   the spelling to recognise, and the lookahead rule that goes with it
||| @ str the string to put in the resulting `Reserved` token, which need not
|||       be the text that was matched
private
resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
resTokenizer1 r str = tmatch (lexerFor r) (const $ Reserved str) where
  -- words and symbols are only matched when the next character could not
  -- continue them; punctuation has no such restriction
  lexerFor : Reserved1 -> Lexer
  lexerFor (Word w) = exact w <+> reject idContEnd
  lexerFor (Sym  s) = exact s <+> reject symCont
  lexerFor (Punc x) = exact x

||| match a reserved token. when there are two spellings, either is accepted
||| but the resulting token always carries the first one
export
resTokenizer : Reserved -> Tokenizer ExtToken
resTokenizer (Only r) = resTokenizer1 r (resString1 r)
resTokenizer (Or r s) =
  let canonical = resString1 r in
  resTokenizer1 r canonical <|> resTokenizer1 s canonical

||| reserved words & symbols.
||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
||| e.g. `=>` in the input (if not part of a longer name)
||| will be returned as `Reserved "⇒"`.
|||
||| `tokens` turns this list into tokenizer alternatives in the same order,
||| so take care when reordering entries.
public export
reserved : List Reserved
reserved =
  -- punctuation
  [Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
   Punc1 ",", Punc1 ";", Punc1 "#[",
   -- symbols; where two spellings are given, the first is the one reported
   Sym1 "@",
   Sym1 ":",
   Sym "⇒" `Or` Sym "=>",
   Sym "→" `Or` Sym "->",
   Sym "×" `Or` Sym "**",
   Sym "≡" `Or` Sym "==",
   Sym "∷" `Or` Sym "::",
   Punc1 ".",
   -- words
   Word1 "case",
   Word1 "case0", Word1 "case1",
   Word "caseω" `Or` Word "case#",
   Word1 "return",
   Word1 "of",
   Word1 "fst", Word1 "snd",
   Word1 "_",
   Word1 "Eq",
   Word "λ" `Or` Word "fun",
   Word "δ" `Or` Word "dfun",
   -- the next two entries mix a `Word` spelling with a `Sym` spelling
   Word "ω" `Or` Sym "#",
   Sym "★" `Or` Word "Type",
   Word "ℕ" `Or` Word "Nat",
   Word1 "IOState",
   Word1 "String",
   Word1 "zero", Word1 "succ",
   Word1 "coe", Word1 "comp",
   -- words that appear to introduce top-level declarations
   -- (going by their names; the lexer itself treats them like any other word)
   Word1 "def",
   Word1 "def0",
   Word "defω" `Or` Word "def#",
   Word1 "postulate",
   Word1 "postulate0",
   Word "postulateω" `Or` Word "postulate#",
   Sym1 "=",
   Word1 "load",
   Word1 "namespace"]

||| the representative string (see `resString`) of every entry of `reserved`,
||| in the same order. these are exactly the strings that the tokenizers built
||| by `resTokenizer` put inside a `Reserved` token.
|||
||| this is `public export` and used by `IsReserved`, so it has to keep
||| reducing at compile time
public export
reservedStrings : List String
reservedStrings = map resString reserved

||| every spelling (see `resString2`) of every entry of `reserved`,
||| including the alternative spellings that `reservedStrings` leaves out
public export
allReservedStrings : List String
allReservedStrings = concatMap resString2 reserved

||| `IsReserved str` is true if `Reserved str` might actually show up in
||| the token stream.
|||
||| concretely it is a `So` proof that `str` is an element of
||| `reservedStrings`, so only the representative spelling of each reserved
||| token qualifies, not its alternative spelling
public export
IsReserved : String -> Type
IsReserved str = So (str `elem` reservedStrings)

||| tokenizer for a possibly-qualified name. the matched text is normalised
||| with `normalizeNfc` and split at each `.`; if any of the resulting parts
||| is a spelling of a reserved token then the whole name is `Invalid`
private
name : Tokenizer ExtToken
name = match name checkParts where
  checkParts : String -> ExtToken
  checkParts str =
    let parts = split (== '.') (normalizeNfc str) in
    case find (\part => part `elem` allReservedStrings) (toList parts) of
      Just w  => Invalid "reserved word '\{w}' inside name \{str}" str
      Nothing => T (Name (fromListP parts))

||| the tokenizer for the whole language: every rule in this module combined
||| with `choice`, in the order listed below.
|||
||| note that the list is written as one expression (rather than being built
||| up from local definitions) and is passed to `choice`, which needs an
||| automatic proof that it is non-empty
export
tokens : Tokenizer ExtToken
tokens = choice $
  -- whitespace, line comments and block comments all become `Skip`
  map skip [pred isWhitespace,
            lineComment (exact "--" <+> reject symCont),
            blockComment (exact "{-") (exact "-}")] <+>
  [universe] <+>  -- Type<i> takes precedence over bare Type
  -- one alternative per entry of `reserved`, in that list's order
  map resTokenizer reserved <+>
  -- `name` comes after the reserved tokens and literals
  [sup, nat, string, tag, name]

||| turn one bounded `ExtToken` into zero or one bounded `Token`s, or an error.
|||
||| * `Skip` gives `empty`
||| * `T tok` gives `tok`, keeping the original bounds
||| * `Invalid msg tok` gives an `Other msg` error, positioned at the first two
|||   components of the bounds, with the first character of `tok` (if any)
export
check : Alternative f =>
        WithBounds ExtToken -> Either Error (f (WithBounds Token))
check (MkBounded Skip _ _) =
  Right empty
check (MkBounded (T tok) irr bounds) =
  Right $ pure $ MkBounded tok irr bounds
check (MkBounded (Invalid msg tok) _ (MkBounds line col _ _)) =
  Left $ Err (Other msg) line col (index 0 tok)

||| convert the tokenizer's reason for stopping into an error reason.
||| `EndInput` is not an error and gives `Nothing`; the other two reasons
||| are carried over to the `ErrorReason` constructor of the same name
export
toErrorReason : StopReason -> Maybe ErrorReason
toErrorReason stop = case stop of
  EndInput              => Nothing
  NoRuleApply           => Just NoRuleApply
  ComposeNotClosing s e => Just (ComposeNotClosing s e)

||| lex a whole string with `tokens`.
|||
||| if the tokenizer stopped for a reason that `toErrorReason` counts as an
||| error, the result is that error at the reported line and column, with the
||| first character of the string returned by the tokenizer (presumably the
||| unconsumed input). otherwise each token is passed through `check`, which
||| drops the `Skip`s and fails at the first `Invalid`
export
lex : String -> Either Error (List (WithBounds Token))
lex str =
  let (toks, stop, line, col, rest) = lex tokens str in
  case toErrorReason stop of
    Just e  => Left $ Err e line col (index 0 rest)
    Nothing => concatMap check toks @{MonoidApplicative}
-												more parser/FromParser stuff

- top level semicolons optional
- type optional [the def will need to be an elim]
- `load` statement
- namespaces

											
										
										
											2023-03-12 13:28:37 -04:00
+								module Quox.Parser.Lexer
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
 								import Quox.CharExtra
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								import Quox.NatExtra
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								import Quox.Name
 								import Data.String.Extra
 								import Data.SortedMap
-												fix weird IsReserved issue

											
										
										
											2023-03-06 06:04:43 -05:00
+								import public Data.String -- for singleton to reduce in IsReserved
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								import public Data.List.Elem
 								import public Text.Lexer
 								import public Text.Lexer.Tokenizer
-												sop → elab-util

											
										
										
											2023-03-02 13:52:32 -05:00
+								import Derive.Prelude
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								%hide TT.Name
 								%default total
 								%language ElabReflection
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								||| @ Reserved reserved token
 								||| @ Name name, possibly qualified
 								||| @ Nat nat literal
-												fix constructor name in comment

											
										
										
											2023-11-02 13:14:28 -04:00
+								||| @ Str string literal
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								||| @ Tag tag literal
-												crude but effective stratification

											
										
										
											2023-05-21 14:09:34 -04:00
+								||| @ TYPE "Type" or "★" with ascii nat directly after
 								||| @ Sup superscript or ^ number (displacement, or universe for ★)
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								public export
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								data Token =
 								    Reserved String
-												more parser/FromParser stuff

- top level semicolons optional
- type optional [the def will need to be an elim]
- `load` statement
- namespaces

											
										
										
											2023-03-12 13:28:37 -04:00
+								  | Name PName
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  | Nat Nat
 								  | Str String
 								  | Tag String
 								  | TYPE Nat
-												crude but effective stratification

											
										
										
											2023-05-21 14:09:34 -04:00
+								  | Sup Nat
-												sop → elab-util

											
										
										
											2023-03-02 13:52:32 -05:00
+								%runElab derive "Token" [Eq, Ord, Show]
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								||| token or whitespace
 								||| @ Skip whitespace, comments, etc
 								||| @ Invalid a token which failed a post-lexer check
 								|||           (e.g. a qualified name containing a keyword)
 								||| @ T a well formed token
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								public export
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								data ExtToken = Skip | Invalid String String | T Token
 								%runElab derive "ExtToken" [Eq, Ord, Show]
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								public export
 								data ErrorReason =
 								    NoRuleApply
 								  | ComposeNotClosing (Int, Int) (Int, Int)
 								  | Other String
 								%runElab derive "ErrorReason" [Eq, Ord, Show]
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								public export
 								record Error where
 								  constructor Err
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								  reason    : ErrorReason
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								  line, col : Int
 								  ||| `Nothing` if the error is at the end of the input
 								  char      : Maybe Char
-												sop → elab-util

											
										
										
											2023-03-02 13:52:32 -05:00
+								%runElab derive "StopReason" [Eq, Ord, Show]
 								%runElab derive "Error"      [Eq, Ord, Show]
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								skip : Lexer -> Tokenizer ExtToken
 								skip t = match t $ const Skip
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								tmatch : Lexer -> (String -> Token) -> Tokenizer ExtToken
 								tmatch t f = match t (T . f)
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								export
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								fromStringLit : (String -> Token) -> String -> ExtToken
 								fromStringLit f str =
 								  case go $ unpack $ drop 1 $ dropLast 1 str of
 								    Left  err => Invalid err str
 								    Right ok  => T $ f $ pack ok
 								where
 								  Interpolation Char where interpolate = singleton
 								  go, hexEscape : List Char -> Either String (List Char)
 								  go []                   = Right []
 								  go ['\\']               = Left "string ends with \\"
 								  go ('\\' :: 'n'  :: cs) = ('\n' ::) <$> go cs
 								  go ('\\' :: 't'  :: cs) = ('\t' ::) <$> go cs
 								  go ('\\' :: 'x'  :: cs) = hexEscape cs
 								  go ('\\' :: 'X'  :: cs) = hexEscape cs
 								  go ('\\' :: '\\' :: cs) = ('\\' ::) <$> go cs
 								  go ('\\' :: '"'  :: cs) = ('"'  ::) <$> go cs
-												add \n and \t escapes to the lexer

											
										
										
											2023-11-03 15:07:59 -04:00
+								  -- [todo] others
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								  go ('\\' :: c :: _)     = Left "unknown escape '\{c}'"
 								  go (c :: cs)            = (c ::) <$> go cs
 								  hexEscape cs =
 								    case break (== ';') cs of
 								      (hs, ';' :: rest) => do
 								        let hs = pack hs
 								        let Just c = Int.fromHex hs
 								          | Nothing => Left #"invalid hex string "\#{hs}" in escape"#
 								        if isCodepoint c
 								           then (chr c ::) <$> go (assert_smaller cs rest)
 								           else Left "codepoint \{hs} out of range"
 								      _ => Left "unterminated hex escape"
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								string : Tokenizer ExtToken
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								string = match stringLit $ fromStringLit Str
 								%hide binLit
 								%hide octLit
 								%hide hexLit
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								nat : Tokenizer ExtToken
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								nat = match  hexLit fromHexLit
 								  <|> tmatch decLit fromDecLit
 								where
 								  withUnderscores : Lexer -> Lexer
 								  withUnderscores l = l <+> many (opt (is '_') <+> l)
 								  withoutUnderscores : String -> String
 								  withoutUnderscores = pack . go . unpack where
 								    go : List Char -> List Char
 								    go []          = []
 								    go ('_' :: cs) =      go cs
 								    go (c   :: cs) = c :: go cs
 								  decLit =
 								    withUnderscores (range '0' '9') <+> reject idContEnd
 								  hexLit =
 								    approx "0x" <+>
 								    withUnderscores (range '0' '9' <|> range 'a' 'f' <|> range 'A' 'F') <+>
 								    reject idContEnd
 								  fromDecLit : String -> Token
 								  fromDecLit = Nat . cast . withoutUnderscores
 								  fromHexLit : String -> ExtToken
 								  fromHexLit str =
 								    maybe (Invalid "invalid hex sequence" str) (T . Nat) $
 								    fromHex $ withoutUnderscores $ drop 2 str
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								tag : Tokenizer ExtToken
 								tag = tmatch (is '\'' <+> name)      (Tag . drop 1)
-												string/nat lit stuff

											
										
										
											2023-11-05 09:38:13 -05:00
+								  <|>  match (is '\'' <+> stringLit) (fromStringLit Tag . drop 1)
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												crude but effective stratification

											
										
										
											2023-05-21 14:09:34 -04:00
+								private %inline
 								fromSup : Char -> Char
 								fromSup c = case c of
 								  '⁰' => '0'; '¹' => '1'; '²' => '2'; '³' => '3'; '⁴' => '4'
 								  '⁵' => '5'; '⁶' => '6'; '⁷' => '7'; '⁸' => '8'; '⁹' => '9'; _ => c
 								private %inline
 								supToNat : String -> Nat
 								supToNat = cast . pack . map fromSup . unpack
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												fix some comments

											
										
										
											2023-11-05 09:41:21 -05:00
+								-- ★0, Type0. base ★/Type is a Reserved and ★¹/Type¹ are sequences of two tokens
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								universe : Tokenizer ExtToken
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								universe = universeWith "★" <|> universeWith "Type" where
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								  universeWith : String -> Tokenizer ExtToken
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								  universeWith pfx =
 								    let len = length pfx in
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								    tmatch (exact pfx <+> digits) (TYPE . cast . drop len)
-												crude but effective stratification

											
										
										
											2023-05-21 14:09:34 -04:00
 								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								sup : Tokenizer ExtToken
 								sup = tmatch (some $ pred isSupDigit) (Sup . supToNat)
 								  <|> tmatch (is '^' <+> digits)      (Sup . cast . drop 1)
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private %inline
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								choice : (xs : List (Tokenizer a)) -> (0 _ : NonEmpty xs) => Tokenizer a
 								choice (t :: ts) = foldl (\a, b => a <|> b) t ts
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								namespace Reserved
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								  ||| description of a reserved symbol
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  ||| @ Word a reserved word (must not be followed by letters, digits, etc)
 								  ||| @ Sym a reserved symbol (must not be followed by symbolic chars)
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								  ||| @ Punc a character that doesn't show up in names (brackets, etc);
 								  |||        also a sequence ending in one of those, like `#[`, since the
 								  |||        difference relates to lookahead
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								  public export
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								  data Reserved1 = Word String | Sym String | Punc String
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  %runElab derive "Reserved1" [Eq, Ord, Show]
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
 								  ||| description of a token that might have unicode & ascii-only aliases
 								  public export
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  data Reserved = Only Reserved1 | Or Reserved1 Reserved1
 								  %runElab derive "Reserved" [Eq, Ord, Show]
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
 								  public export
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								  Sym1, Word1, Punc1 : String -> Reserved
 								  Sym1  = Only . Sym
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  Word1 = Only . Word
 								  Punc1 = Only . Punc
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
 								public export
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								resString1 : Reserved1 -> String
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								resString1 (Punc x) = x
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								resString1 (Word w) = w
 								resString1 (Sym  s) = s
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
 								||| return the representative string for a token description. if there are
 								||| two, then it's the first one, which should be the full-unicode one
 								public export
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								resString : Reserved -> String
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								resString (Only r)   = resString1 r
 								resString (r `Or` _) = resString1 r
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								||| return both representative strings for a token description
 								public export
 								resString2 : Reserved -> List String
 								resString2 (Only r)   = [resString1 r]
 								resString2 (r `Or` s) = [resString1 r, resString1 s]
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								private
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								resTokenizer1 : Reserved1 -> String -> Tokenizer ExtToken
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								resTokenizer1 r str =
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								  let res : String -> Token := const $ Reserved str in
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								  case r of Word w => tmatch (exact w <+> reject idContEnd) res
 								            Sym  s => tmatch (exact s <+> reject symCont)   res
 								            Punc x => tmatch (exact x)                      res
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
 								||| match a reserved token
 								export
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								resTokenizer : Reserved -> Tokenizer ExtToken
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								resTokenizer (Only r)   = resTokenizer1 r (resString1 r)
 								resTokenizer (r `Or` s) =
 								  resTokenizer1 r (resString1 r) <|> resTokenizer1 s (resString1 r)
 								||| reserved words & symbols.
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								||| the tokens recognised by ``a `Or` b`` will be `Reserved a`.
 								||| e.g. `=>` in the input (if not part of a longer name)
 								||| will be returned as `Reserved "⇒"`.
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								public export
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								reserved : List Reserved
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								reserved =
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								  [Punc1 "(", Punc1 ")", Punc1 "[", Punc1 "]", Punc1 "{", Punc1 "}",
 								   Punc1 ",", Punc1 ";", Punc1 "#[",
 								   Sym1 "@",
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								   Sym1 ":",
 								   Sym "⇒" `Or` Sym "=>",
 								   Sym "→" `Or` Sym "->",
 								   Sym "×" `Or` Sym "**",
 								   Sym "≡" `Or` Sym "==",
 								   Sym "∷" `Or` Sym "::",
-												change it to #[..] since # is also reserved

											
										
										
											2023-09-22 12:38:40 -04:00
+								   Punc1 ".",
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								   Word1 "case",
-												add `case0` to syntax since that is possible sometimes

											
										
										
											2023-04-02 09:22:39 -04:00
+								   Word1 "case0", Word1 "case1",
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								   Word "caseω" `Or` Word "case#",
 								   Word1 "return",
 								   Word1 "of",
-												add fst and snd

											
										
										
											2023-09-18 15:52:51 -04:00
+								   Word1 "fst", Word1 "snd",
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								   Word1 "_",
 								   Word1 "Eq",
 								   Word "λ" `Or` Word "fun",
 								   Word "δ" `Or` Word "dfun",
 								   Word "ω" `Or` Sym "#",
 								   Sym "★" `Or` Word "Type",
-												natural numbers

											
										
										
											2023-03-26 08:40:54 -04:00
+								   Word "ℕ" `Or` Word "Nat",
-												always vsep scheme lets, otherwise they are unreadable

											
										
										
											2023-11-01 10:17:15 -04:00
+								   Word1 "IOState",
 								   Word1 "String",
-												natural numbers

											
										
										
											2023-03-26 08:40:54 -04:00
+								   Word1 "zero", Word1 "succ",
-												coercions and compositions

											
										
										
											2023-04-15 09:13:01 -04:00
+								   Word1 "coe", Word1 "comp",
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								   Word1 "def",
 								   Word1 "def0",
 								   Word "defω" `Or` Word "def#",
-												add postulate, #[compile-scheme], #[main]

											
										
										
											2023-11-01 07:56:27 -04:00
+								   Word1 "postulate",
 								   Word1 "postulate0",
 								   Word "postulateω" `Or` Word "postulate#",
-												replace '≔' and '·' with '=' and (only) '.'

											
										
										
											2023-03-17 16:54:09 -04:00
+								   Sym1 "=",
-												more parser/FromParser stuff

- top level semicolons optional
- type optional [the def will need to be an elim]
- `load` statement
- namespaces

											
										
										
											2023-03-12 13:28:37 -04:00
+								   Word1 "load",
 								   Word1 "namespace"]
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
 								||| the strings obtained by applying `resString` to every entry of
 								||| `reserved`, in the same order. `IsReserved` is membership in this list.
 								|||
 								||| NOTE(review): presumably `resString` yields the one spelling of each
 								||| entry that is emitted in a `Reserved` token -- confirm against its
 								||| definition.
+								public export
 								reservedStrings : List String
 								reservedStrings = map resString reserved
 								||| the lists produced by `resString2` for every entry of `reserved`,
 								||| appended together. the `name` tokenizer rejects a dotted name if any of
 								||| its components occurs in this list.
 								|||
 								||| NOTE(review): presumably `resString2` returns every accepted spelling
 								||| of an entry (both sides of an `Or`) -- confirm against its definition.
 								public export
 								allReservedStrings : List String
 								allReservedStrings = foldMap resString2 reserved
-												change some single-character constructor names

											
										
										
											2023-03-08 10:46:29 -05:00
+								||| `IsReserved str` is true if `Reserved str` might actually show up in
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								||| the token stream
 								|||
 								||| concretely, it is a `So` proof that `str` is an element of
 								||| `reservedStrings`.
 								public export
 								IsReserved : String -> Type
-												fix IsReserved

IsReserved should be true for e.g. "λ" but not "fun", since only the
first can show up in the lexer output

											
										
										
											2023-11-05 09:43:17 -05:00
+								IsReserved str = So (str `elem` reservedStrings)
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
 								||| tokenizer for (possibly qualified) names. the matched text is
 								||| NFC-normalised and split on `.`; if any component is one of
 								||| `allReservedStrings` the whole match becomes an `Invalid` token,
 								||| otherwise it becomes a `Name`.
 								private
 								name : Tokenizer ExtToken
 								name = match name classify
 								where
 								  ||| decide what token a piece of matched text stands for
 								  classify : String -> ExtToken
 								  classify str =
 								    let parts = split (== '.') (normalizeNfc str)
 								        clash = find (\part => part `elem` allReservedStrings)
 								                     (toList parts)
 								    in
 								    maybe (T (Name (fromListP parts)))
 								          (\w => Invalid "reserved word '\{w}' inside name \{str}" str)
 								          clash
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
 								||| the tokenizer for a whole input. it is a `choice` between, in the
 								||| order listed:
 								||| - `Skip`s for whitespace, line comments and `{- -}` block comments
 								|||   (a line comment starts with `--` followed by something `symCont`
 								|||   rejects);
 								||| - the `universe` tokenizer;
 								||| - one tokenizer per entry of `reserved` (built by `resTokenizer`);
 								||| - `sup`, `nat`, `string`, `tag` and finally `name`.
+								export
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								tokens : Tokenizer ExtToken
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
+								tokens = choice $
 								  map skip [pred isWhitespace,
 								            lineComment (exact "--" <+> reject symCont),
 								            blockComment (exact "{-") (exact "-}")] <+>
-												fix some comments

											
										
										
											2023-11-05 09:41:21 -05:00
+								  [universe] <+>  -- Type<i> takes precedence over bare Type
-												parser

											
										
										
											2023-03-04 15:02:51 -05:00
+								  map resTokenizer reserved <+>
-												crude but effective stratification

											
										
										
											2023-05-21 14:09:34 -04:00
+								  [sup, nat, string, tag, name]
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								export
 								check : Alternative f =>
 								        WithBounds ExtToken -> Either Error (f (WithBounds Token))
 								check (MkBounded val irr bounds@(MkBounds line col _ _)) = case val of
 								  Skip            => Right empty
 								  T tok           => Right $ pure $ MkBounded tok irr bounds
 								  Invalid msg tok => Left  $ Err (Other msg) line col (index 0 tok)
 								||| convert the tokenizer's `StopReason` into this module's
 								||| `ErrorReason`. stopping at the end of the input is not an error, so
 								||| `EndInput` maps to `Nothing`; the other reasons are carried over
 								||| unchanged.
 								export
 								toErrorReason : StopReason -> Maybe ErrorReason
 								toErrorReason stop = case stop of
 								  EndInput                    => Nothing
 								  NoRuleApply                 => Just NoRuleApply
 								  ComposeNotClosing start end => Just (ComposeNotClosing start end)
-												lexer

											
										
										
											2023-02-28 14:51:54 -05:00
 								||| tokenize a whole string with `tokens`.
 								|||
 								||| if the tokenizer's stop reason is an error according to
 								||| `toErrorReason`, that error is returned with the line and column the
 								||| tokenizer reported and the first character of the string it returned
 								||| (`Nothing` if that string is empty). otherwise every token is passed
 								||| through `check`, which drops `Skip`s and turns an `Invalid` token
 								||| into an error.
 								|||
 								||| NOTE(review): the `str` bound by the `let` shadows the argument;
 								||| presumably it is the unconsumed remainder of the input -- confirm
 								||| against `Text.Lexer.Tokenizer.lex`.
+								export
 								lex : String -> Either Error (List (WithBounds Token))
 								lex str =
 								  let (res, reason, line, col, str) = lex tokens str in
-												detect reserved words inside names like 'a.λ.b'

											
										
										
											2023-09-24 11:36:20 -04:00
+								  case toErrorReason reason of
 								    Nothing => concatMap check res @{MonoidApplicative}
 								    Just e  => Left $ Err {reason = e, line, col, char = index 0 str}