unicode stuff

This commit is contained in:
rhiannon morris 2023-02-27 07:27:27 +01:00
parent 28356200c1
commit cacb3225a2
3 changed files with 154 additions and 140 deletions

153
lib/Quox/Unicode.idr Normal file
View file

@ -0,0 +1,153 @@
module Quox.Unicode
import Generics.Derive
%default total
%language ElabReflection
namespace Letter
public export
data Letter = Uppercase | Lowercase | Titlecase | Modifier | Other
%runElab derive "Letter" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Mark
public export
data Mark = NonSpacing | SpacingCombining | Enclosing
%runElab derive "Mark" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Number
public export
data Number = Decimal | Letter | Other
%runElab derive "Number" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Punctuation
public export
data Punctuation = Connector | Dash | Open | Close
| InitialQuote | FinalQuote | Other
%runElab derive "Punctuation" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Symbol
public export
data Symbol = Math | Currency | Modifier | Other
%runElab derive "Symbol" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Separator
public export
data Separator = Space | Line | Paragraph
%runElab derive "Separator" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Other
public export
data Other = Control | Format | Surrogate | PrivateUse | NotAssigned
%runElab derive "Other" [Generic, Meta, Eq, Ord, DecEq, Show]
public export
data GeneralCategory
= Letter Letter
| Mark Mark
| Number Number
| Punctuation Punctuation
| Symbol Symbol
| Separator Separator
| Other Other
%runElab derive "GeneralCategory" [Generic, Meta, Eq, Ord, DecEq, Show]
private
%foreign "scheme:(lambda (c) (symbol->string (char-general-category c)))"
prim__genCat : Char -> String
export
genCat : Char -> GeneralCategory
genCat ch = assert_total $
case prim__genCat ch of
"Lu" => Letter Uppercase
"Ll" => Letter Lowercase
"Lt" => Letter Titlecase
"Lm" => Letter Modifier
"Lo" => Letter Other
"Mn" => Mark NonSpacing
"Mc" => Mark SpacingCombining
"Me" => Mark Enclosing
"Nd" => Number Decimal
"Nl" => Number Letter
"No" => Number Other
"Pc" => Punctuation Connector
"Pd" => Punctuation Dash
"Ps" => Punctuation Open
"Pe" => Punctuation Close
"Pi" => Punctuation InitialQuote
"Pf" => Punctuation FinalQuote
"Po" => Punctuation Other
"Sm" => Symbol Math
"Sc" => Symbol Currency
"Sk" => Symbol Modifier
"So" => Symbol Other
"Zs" => Separator Space
"Zl" => Separator Line
"Zp" => Separator Paragraph
"Cc" => Other Control
"Cf" => Other Format
"Cs" => Other Surrogate
"Co" => Other PrivateUse
"Cn" => Other NotAssigned
cat => idris_crash #"Quox.Unicode.genCat: unknown category "\{cat}""#
namespace GeneralCategory
public export %inline
isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator, isOther :
GeneralCategory -> Bool
isLetter = \case Letter _ => True; _ => False
isMark = \case Mark _ => True; _ => False
isNumber = \case Number _ => True; _ => False
isPunctuation = \case Punctuation _ => True; _ => False
isSymbol = \case Symbol _ => True; _ => False
isSeparator = \case Separator _ => True; _ => False
isOther = \case Other _ => True; _ => False
namespace Char
public export %inline
isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator, isOther :
Char -> Bool
isLetter = isLetter . genCat
isMark = isMark . genCat
isNumber = isNumber . genCat
isPunctuation = isPunctuation . genCat
isSymbol = isSymbol . genCat
isSeparator = isSeparator . genCat
isOther = isOther . genCat
export
isIdStart : Char -> Bool
isIdStart ch = isLetter ch || isNumber ch && not ('0' <= ch && ch <= '9')
export
isIdCont : Char -> Bool
isIdCont ch = isIdStart ch || ch == '\'' || isMark ch || isNumber ch
export
isIdConnector : Char -> Bool
isIdConnector ch = genCat ch == Punctuation Connector
export
isSymChar : Char -> Bool
isSymChar ch =
case genCat ch of
Symbol _ => True
Punctuation Dash => True
Punctuation Other => True
_ => False
export
isWhitespace : Char -> Bool
isWhitespace ch = ch == '\t' || ch == '\r' || ch == '\n' || isSeparator ch
export
%foreign "scheme:string-normalize-nfc"
normalizeNfc : String -> String

View file

@ -1,140 +0,0 @@
module Quox.Unicode
import Generics.Derive
%default total
%language ElabReflection
namespace Letter
public export
data Letter = Uppercase | Lowercase | Titlecase | Modifier | Other
%runElab derive "Letter" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Mark
public export
data Mark = NonSpacing | SpacingCombining | Enclosing
%runElab derive "Mark" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Number
public export
data Number = Decimal | Letter | Other
%runElab derive "Number" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Punctuation
public export
data Punctuation = Connector | Dash | Open | Close
| InitialQuote | FinalQuote | Other
%runElab derive "Punctuation" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Symbol
public export
data Symbol = Math | Currency | Modifier | Other
%runElab derive "Symbol" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Separator
public export
data Separator = Space | Line | Paragraph
%runElab derive "Separator" [Generic, Meta, Eq, Ord, DecEq, Show]
namespace Other
public export
data Other = Control | Format | Surrogate | PrivateUse | NotAssigned
%runElab derive "Other" [Generic, Meta, Eq, Ord, DecEq, Show]
public export
data GeneralCategory
= Letter Letter
| Mark Mark
| Number Number
| Punctuation Punctuation
| Symbol Symbol
| Separator Separator
| Other Other
%runElab derive "GeneralCategory" [Generic, Meta, Eq, Ord, DecEq, Show]
private
%foreign "scheme:(lambda (c) (symbol->string (char-general-category c)))"
prim__genCat : Char -> String
export
genCat : Char -> GeneralCategory
genCat ch = assert_total $
case prim__genCat ch of
"Lu" => Letter Uppercase
"Ll" => Letter Lowercase
"Lt" => Letter Titlecase
"Lm" => Letter Modifier
"Lo" => Letter Other
"Mn" => Mark NonSpacing
"Mc" => Mark SpacingCombining
"Me" => Mark Enclosing
"Nd" => Number Decimal
"Nl" => Number Letter
"No" => Number Other
"Pc" => Punctuation Connector
"Pd" => Punctuation Dash
"Ps" => Punctuation Open
"Pe" => Punctuation Close
"Pi" => Punctuation InitialQuote
"Pf" => Punctuation FinalQuote
"Po" => Punctuation Other
"Sm" => Symbol Math
"Sc" => Symbol Currency
"Sk" => Symbol Modifier
"So" => Symbol Other
"Zs" => Separator Space
"Zl" => Separator Line
"Zp" => Separator Paragraph
"Cc" => Other Control
"Cf" => Other Format
"Cs" => Other Surrogate
"Co" => Other PrivateUse
"Cn" => Other NotAssigned
export
isIdStart : Char -> Bool
isIdStart ch =
case genCat ch of
Letter _ => True
Number _ => not ('0' <= ch && ch <= '9')
_ => False
export
isIdCont : Char -> Bool
isIdCont ch =
isIdStart ch || ch == '\'' ||
case genCat ch of
Mark _ => True
Number _ => True
_ => False
export
isIdConnector : Char -> Bool
isIdConnector ch =
case genCat ch of Punctuation Connector => True; _ => False
export
isSymChar : Char -> Bool
isSymChar ch =
case genCat ch of
Symbol _ => True
Punctuation Dash => True
Punctuation Other => True
_ => False
export
isWhitespace : Char -> Bool
isWhitespace ch =
ch == '\t' || ch == '\r' || ch == '\n' ||
case genCat ch of Separator _ => True; _ => False
export
%foreign "scheme:string-normalize-nfc"
normalizeNfc : String -> String

View file

@ -11,6 +11,7 @@ modules =
Quox.NatExtra,
Quox.BoolExtra,
Quox.Decidable,
Quox.Unicode,
Quox.No,
Quox.Pretty,
Quox.Syntax,