diff --git a/lib/Quox/CharExtra.idr b/lib/Quox/Unicode.idr similarity index 97% rename from lib/Quox/CharExtra.idr rename to lib/Quox/Unicode.idr index 52b9fa0..bcaf956 100644 --- a/lib/Quox/CharExtra.idr +++ b/lib/Quox/Unicode.idr @@ -1,4 +1,4 @@ -module Quox.CharExtra +module Quox.Unicode import Generics.Derive @@ -133,3 +133,8 @@ isWhitespace : Char -> Bool isWhitespace ch = ch == '\t' || ch == '\r' || ch == '\n' || case genCat ch of Separator _ => True; _ => False + + +export +%foreign "scheme:string-normalize-nfc" +normalizeNfc : String -> String diff --git a/lib/quox-lib.ipkg b/lib/quox-lib.ipkg index dd5a02a..e31db43 100644 --- a/lib/quox-lib.ipkg +++ b/lib/quox-lib.ipkg @@ -9,7 +9,7 @@ depends = base, contrib, elab-util, sop, snocvect modules = Quox.NatExtra, - Quox.CharExtra, + Quox.Unicode, Quox.OPE, Quox.Pretty, Quox.Syntax, diff --git a/tests/Tests.idr b/tests/Tests.idr index b6a7462..2053dbd 100644 --- a/tests/Tests.idr +++ b/tests/Tests.idr @@ -2,7 +2,7 @@ module Tests import Options import TAP -import Tests.CharExtra +import Tests.Unicode import Tests.Lexer import Tests.Parser import Tests.Equal @@ -10,7 +10,7 @@ import System allTests = [ - CharExtra.tests, + Unicode.tests, Lexer.tests, Parser.tests, Equal.tests diff --git a/tests/Tests/CharExtra.idr b/tests/Tests/CharExtra.idr deleted file mode 100644 index 594c25a..0000000 --- a/tests/Tests/CharExtra.idr +++ /dev/null @@ -1,56 +0,0 @@ -module Tests.CharExtra - -import Quox.NatExtra -import Quox.CharExtra -import Data.List -import Data.String -import Data.Maybe -import TAP - - -escape : Char -> Maybe String -escape '\'' = Nothing -escape c = - if ord c > 0xFF then Nothing else - case unpack $ show c of - '\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs - _ => Nothing - -display : Char -> String -display c = - let ord = cast c {to = Nat} in - let c' = fromMaybe (singleton c) $ escape c in - if 0x20 <= ord && ord <= 0xFF then - "「\{c'}」" - else - let codepoint = padLeft 4 '0' $ showHex ord in - "「\{c'}」 (U+\{codepoint})" - -testOne : (Char -> Bool) -> Char -> Test -testOne pred c = test (display c) $ unless (pred c) $ Left () - -testAll : String -> (Char -> Bool) -> List Char -> Test -testAll label pred chars = label :- map (testOne pred) chars - - -tests = "char extras" :- [ - testAll "id starts" isIdStart - ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'], - testAll "not id starts" (not . isIdStart) - ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'], - testAll "id continuations" isIdCont - ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''], - testAll "not id continuations" (not . isIdCont) - ['_', '‿', ' ', '[', ',', '.', '\1'], - testAll "id connectors" isIdConnector - ['_', '‿'], - testAll "not id connectors" (not . isIdConnector) - ['a', ' ', ',', '-'], - testAll "white space" isWhitespace - [' ', '\t', '\r', '\n', - '\x2028', -- line separator - '\x2029' -- paragraph separator - ], - testAll "not white space" (not . isWhitespace) - ['a', '-', '_', '\1'] -] diff --git a/tests/Tests/Unicode.idr b/tests/Tests/Unicode.idr new file mode 100644 index 0000000..e3b0c1c --- /dev/null +++ b/tests/Tests/Unicode.idr @@ -0,0 +1,90 @@ +module Tests.Unicode + +import Quox.NatExtra +import Quox.Unicode +import Data.List +import Data.String +import Data.Maybe +import TAP + + +maxLatin1 = '\xFF' + +escape : Char -> Maybe String +escape '\'' = Nothing +escape c = + if c > maxLatin1 then Nothing else + case unpack $ show c of + '\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs + _ => Nothing + +codepoint : Char -> String +codepoint = padLeft 4 '0' . showHex . cast + +display : Char -> String +display c = + let c' = fromMaybe (singleton c) $ escape c in + if '\x20' <= c && c <= maxLatin1 + then "「\{c'}」" + else "「\{c'}」 (U+\{codepoint c})" + +displayS' : String -> String +displayS' = + foldMap (\c => if c <= maxLatin1 then singleton c else "\\x\{codepoint c}") . + unpack + +displayS : String -> String +displayS str = + if all (<= maxLatin1) (unpack str) + then "「\{str}」" + else "「\{str}」 (\"\{displayS' str}\")" + +testOneChar : (Char -> Bool) -> Char -> Test +testOneChar pred c = test (display c) $ unless (pred c) $ Left () + +testAllChars : String -> (Char -> Bool) -> List Char -> Test +testAllChars label pred chars = label :- map (testOneChar pred) chars + + +testNfc : String -> String -> Test +testNfc input result = + test (displayS input) $ + let norm = normalizeNfc input in + unless (norm == result) $ + Left [("expected", displayS result), ("received", displayS norm)] + +testAlreadyNfc : String -> Test +testAlreadyNfc input = testNfc input input + + + +tests = "unicode" :- [ + "general categories" :- [ + testAllChars "id starts" isIdStart + ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'], + testAllChars "not id starts" (not . isIdStart) + ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'], + testAllChars "id continuations" isIdCont + ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''], + testAllChars "not id continuations" (not . isIdCont) + ['_', '‿', ' ', '[', ',', '.', '\1'], + testAllChars "id connectors" isIdConnector + ['_', '‿'], + testAllChars "not id connectors" (not . isIdConnector) + ['a', ' ', ',', '-'], + testAllChars "white space" isWhitespace + [' ', '\t', '\r', '\n', + '\x2028', -- line separator + '\x2029' -- paragraph separator + ], + testAllChars "not white space" (not . isWhitespace) + ['a', '-', '_', '\1'] + ], + + "normalisation" :- [ + testNfc "e\x301" "é", + testAlreadyNfc "é", + testAlreadyNfc "" + -- idk if this is wrong it's chez's fault. or unicode's + ] +]