normalizeNfc

2022-05-12 07:41:58 +02:00 · 2022-05-12 07:41:58 +02:00 · 274ecfb58c
commit 274ecfb58c
parent c743a99356
5 changed files with 99 additions and 60 deletions
--- a/lib/Quox/CharExtra.idr
+++ b/lib/Quox/CharExtra.idr
@ -1,4 +1,4 @@
-module Quox.CharExtra
+module Quox.Unicode

 import Generics.Derive

@ -133,3 +133,8 @@ isWhitespace : Char -> Bool
 isWhitespace ch =
  ch == '\t' || ch == '\r' || ch == '\n' ||
  case genCat ch of Separator _ => True; _ => False
+
+
+export
+%foreign "scheme:string-normalize-nfc"
+normalizeNfc : String -> String
--- a/lib/quox-lib.ipkg
+++ b/lib/quox-lib.ipkg
@ -9,7 +9,7 @@ depends = base, contrib, elab-util, sop, snocvect

 modules =
  Quox.NatExtra,
-  Quox.CharExtra,
+  Quox.Unicode,
  Quox.OPE,
  Quox.Pretty,
  Quox.Syntax,
--- a/tests/Tests.idr
+++ b/tests/Tests.idr
@ -2,7 +2,7 @@ module Tests

 import Options
 import TAP
-import Tests.CharExtra
+import Tests.Unicode
 import Tests.Lexer
 import Tests.Parser
 import Tests.Equal
@ -10,7 +10,7 @@ import System


 allTests = [
-  CharExtra.tests,
+  Unicode.tests,
  Lexer.tests,
  Parser.tests,
  Equal.tests
--- a/tests/Tests/CharExtra.idr
+++ b/tests/Tests/CharExtra.idr
@ -1,56 +0,0 @@
-module Tests.CharExtra
-
-import Quox.NatExtra
-import Quox.CharExtra
-import Data.List
-import Data.String
-import Data.Maybe
-import TAP
-
-
-escape : Char -> Maybe String
-escape '\'' = Nothing
-escape c =
-  if ord c > 0xFF then Nothing else
-  case unpack $ show c of
-       '\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
-       _                  => Nothing
-
-display : Char -> String
-display c =
-  let ord = cast c {to = Nat} in
-  let c'  = fromMaybe (singleton c) $ escape c in
-  if 0x20 <= ord && ord <= 0xFF then
-    "｢\{c'}｣"
-  else
-    let codepoint = padLeft 4 '0' $ showHex ord in
-    "｢\{c'}｣ (U+\{codepoint})"
-
-testOne : (Char -> Bool) -> Char -> Test
-testOne pred c = test (display c) $ unless (pred c) $ Left ()
-
-testAll : String -> (Char -> Bool) -> List Char -> Test
-testAll label pred chars = label :- map (testOne pred) chars
-
-
-tests = "char extras" :- [
-  testAll "id starts" isIdStart
-    ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
-  testAll "not id starts" (not . isIdStart)
-    ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
-  testAll "id continuations" isIdCont
-    ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
-  testAll "not id continuations" (not . isIdCont)
-    ['_', '‿', ' ', '[', ',', '.', '\1'],
-  testAll "id connectors" isIdConnector
-    ['_', '‿'],
-  testAll "not id connectors" (not . isIdConnector)
-    ['a', ' ', ',', '-'],
-  testAll "white space" isWhitespace
-    [' ', '\t', '\r', '\n',
-     '\x2028', -- line separator
-     '\x2029'  -- paragraph separator
-    ],
-  testAll "not white space" (not . isWhitespace)
-    ['a', '-', '_', '\1']
-]
--- a/tests/Tests/Unicode.idr
+++ b/tests/Tests/Unicode.idr
@ -0,0 +1,90 @@
+module Tests.Unicode
+
+import Quox.NatExtra
+import Quox.Unicode
+import Data.List
+import Data.String
+import Data.Maybe
+import TAP
+
+
+maxLatin1 = '\xFF'
+
+escape : Char -> Maybe String
+escape '\'' = Nothing
+escape c =
+  if c > maxLatin1 then Nothing else
+  case unpack $ show c of
+       '\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
+       _                  => Nothing
+
+codepoint : Char -> String
+codepoint = padLeft 4 '0' . showHex . cast
+
+display : Char -> String
+display c =
+  let c'  = fromMaybe (singleton c) $ escape c in
+  if '\x20' <= c && c <= maxLatin1
+     then "｢\{c'}｣"
+     else "｢\{c'}｣ (U+\{codepoint c})"
+
+displayS' : String -> String
+displayS' =
+  foldMap (\c => if c <= maxLatin1 then singleton c else "\\x\{codepoint c}") .
+  unpack
+
+displayS : String -> String
+displayS str =
+  if all (<= maxLatin1) (unpack str)
+     then "｢\{str}｣"
+     else "｢\{str}｣ (\"\{displayS' str}\")"
+
+testOneChar : (Char -> Bool) -> Char -> Test
+testOneChar pred c = test (display c) $ unless (pred c) $ Left ()
+
+testAllChars : String -> (Char -> Bool) -> List Char -> Test
+testAllChars label pred chars = label :- map (testOneChar pred) chars
+
+
+testNfc : String -> String -> Test
+testNfc input result =
+  test (displayS input) $
+  let norm = normalizeNfc input in
+  unless (norm == result) $
+  Left [("expected", displayS result), ("received", displayS norm)]
+
+testAlreadyNfc : String -> Test
+testAlreadyNfc input = testNfc input input
+
+
+
+tests = "unicode" :- [
+  "general categories" :- [
+    testAllChars "id starts" isIdStart
+      ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
+    testAllChars "not id starts" (not . isIdStart)
+      ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
+    testAllChars "id continuations" isIdCont
+      ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
+    testAllChars "not id continuations" (not . isIdCont)
+      ['_', '‿', ' ', '[', ',', '.', '\1'],
+    testAllChars "id connectors" isIdConnector
+      ['_', '‿'],
+    testAllChars "not id connectors" (not . isIdConnector)
+      ['a', ' ', ',', '-'],
+    testAllChars "white space" isWhitespace
+      [' ', '\t', '\r', '\n',
+       '\x2028', -- line separator
+       '\x2029'  -- paragraph separator
+      ],
+    testAllChars "not white space" (not . isWhitespace)
+      ['a', '-', '_', '\1']
+  ],
+
+  "normalisation" :- [
+    testNfc        "e\x301" "é",
+    testAlreadyNfc "é",
+    testAlreadyNfc ""
+    -- idk if this is wrong it's chez's fault. or unicode's
+  ]
+]