quox/lib/Quox/CharExtra.idr

176 lines
4.6 KiB
Idris
Raw Normal View History

2023-02-28 14:51:54 -05:00
module Quox.CharExtra
2023-02-27 01:27:27 -05:00
2023-03-02 13:52:32 -05:00
import Derive.Prelude
2023-02-27 01:27:27 -05:00
%default total
%language ElabReflection
namespace Letter
public export
data Letter = Uppercase | Lowercase | Titlecase | Modifier | Other
2023-03-02 13:52:32 -05:00
%runElab derive "Letter" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Mark
public export
data Mark = NonSpacing | SpacingCombining | Enclosing
2023-03-02 13:52:32 -05:00
%runElab derive "Mark" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Number
public export
data Number = Decimal | Letter | Other
2023-03-02 13:52:32 -05:00
%runElab derive "Number" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Punctuation
public export
data Punctuation = Connector | Dash | Open | Close
| InitialQuote | FinalQuote | Other
2023-03-02 13:52:32 -05:00
%runElab derive "Punctuation" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Symbol
public export
data Symbol = Math | Currency | Modifier | Other
2023-03-02 13:52:32 -05:00
%runElab derive "Symbol" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Separator
public export
data Separator = Space | Line | Paragraph
2023-03-02 13:52:32 -05:00
%runElab derive "Separator" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
namespace Other
public export
data Other = Control | Format | Surrogate | PrivateUse | NotAssigned
2023-03-02 13:52:32 -05:00
%runElab derive "Other" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
public export
data GeneralCategory
= Letter Letter
| Mark Mark
| Number Number
| Punctuation Punctuation
| Symbol Symbol
| Separator Separator
| Other Other
2023-03-02 13:52:32 -05:00
%runElab derive "GeneralCategory" [Eq, Ord, Show]
2023-02-27 01:27:27 -05:00
private
%foreign "scheme:(lambda (c) (symbol->string (char-general-category c)))"
prim__genCat : Char -> String
export
genCat : Char -> GeneralCategory
genCat ch = assert_total $
case prim__genCat ch of
"Lu" => Letter Uppercase
"Ll" => Letter Lowercase
"Lt" => Letter Titlecase
"Lm" => Letter Modifier
"Lo" => Letter Other
"Mn" => Mark NonSpacing
"Mc" => Mark SpacingCombining
"Me" => Mark Enclosing
"Nd" => Number Decimal
"Nl" => Number Letter
"No" => Number Other
"Pc" => Punctuation Connector
"Pd" => Punctuation Dash
"Ps" => Punctuation Open
"Pe" => Punctuation Close
"Pi" => Punctuation InitialQuote
"Pf" => Punctuation FinalQuote
"Po" => Punctuation Other
"Sm" => Symbol Math
"Sc" => Symbol Currency
"Sk" => Symbol Modifier
"So" => Symbol Other
"Zs" => Separator Space
"Zl" => Separator Line
"Zp" => Separator Paragraph
"Cc" => Other Control
"Cf" => Other Format
"Cs" => Other Surrogate
"Co" => Other PrivateUse
"Cn" => Other NotAssigned
cat => idris_crash #"Quox.Unicode.genCat: unknown category "\{cat}""#
namespace GeneralCategory
public export %inline
isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator, isOther :
GeneralCategory -> Bool
isLetter = \case Letter _ => True; _ => False
isMark = \case Mark _ => True; _ => False
isNumber = \case Number _ => True; _ => False
isPunctuation = \case Punctuation _ => True; _ => False
isSymbol = \case Symbol _ => True; _ => False
isSeparator = \case Separator _ => True; _ => False
isOther = \case Other _ => True; _ => False
namespace Char
public export %inline
isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator, isOther :
Char -> Bool
isLetter = isLetter . genCat
isMark = isMark . genCat
isNumber = isNumber . genCat
isPunctuation = isPunctuation . genCat
isSymbol = isSymbol . genCat
isSeparator = isSeparator . genCat
isOther = isOther . genCat
2023-05-21 14:09:34 -04:00
export
isSupDigit : Char -> Bool
isSupDigit ch = ch `elem` unpack "⁰¹²³⁴⁵⁶⁷⁸⁹"
export
isSubDigit : Char -> Bool
isSubDigit ch = ch `elem` unpack "₀₁₂₃₄₅₆₇₈₉"
export
isAsciiDigit : Char -> Bool
isAsciiDigit ch = '0' <= ch && ch <= '9'
2023-02-27 01:27:27 -05:00
export
isIdStart : Char -> Bool
2023-02-28 14:51:54 -05:00
isIdStart ch =
2023-05-21 14:09:34 -04:00
(ch == '_' || isLetter ch || isNumber ch) &&
not (isSupDigit ch || isAsciiDigit ch)
2023-02-27 01:27:27 -05:00
export
isIdCont : Char -> Bool
2023-02-28 14:51:54 -05:00
isIdCont ch =
2023-05-21 14:09:34 -04:00
(isIdStart ch || ch == '\'' || ch == '-' || isMark ch || isNumber ch) &&
not (isSupDigit ch)
2023-02-27 01:27:27 -05:00
export
isIdConnector : Char -> Bool
isIdConnector ch = genCat ch == Punctuation Connector
export
isSymChar : Char -> Bool
2023-02-28 14:51:54 -05:00
isSymChar ch = case genCat ch of
Symbol _ => True
Punctuation Dash => True
Punctuation Other => True
_ => False
2023-02-27 01:27:27 -05:00
export
isWhitespace : Char -> Bool
isWhitespace ch = ch == '\t' || ch == '\r' || ch == '\n' || isSeparator ch
export
%foreign "scheme:string-normalize-nfc"
normalizeNfc : String -> String
2023-11-05 09:38:13 -05:00
export
isCodepoint : Int -> Bool
isCodepoint n =
n <= 0x10FFFF &&
not (n >= 0xD800 && n <= 0xDBFF || n >= 0xDC00 && n <= 0xDFFF)