normalizeNfc

2022-05-12 07:41:58 +02:00 · 2022-05-12 07:41:58 +02:00 · 274ecfb58c
commit 274ecfb58c
parent c743a99356
5 changed files with 99 additions and 60 deletions
--- a/lib/Quox/CharExtra.idr
+++ b/lib/Quox/CharExtra.idr
@ -1,4 +1,4 @@
-module Quox.CharExtra
+module Quox.Unicode
 import Generics.Derive
@ -133,3 +133,8 @@ isWhitespace : Char -> Bool
 isWhitespace ch =
  -- tab, carriage return and line feed are accepted explicitly
  ch == '\t' || ch == '\r' || ch == '\n' ||
  -- otherwise accept any character whose general category is a Separator
  case genCat ch of Separator _ => True; _ => False
 ||| Normalise a string to Unicode NFC by calling the Scheme procedure
 ||| `string-normalize-nfc` through the FFI.
 |||
 ||| Only a `scheme:` specifier is given, so no implementation is declared
 ||| for non-Scheme backends.
 export
 %foreign "scheme:string-normalize-nfc"
 normalizeNfc : String -> String
--- a/lib/quox-lib.ipkg
+++ b/lib/quox-lib.ipkg
@ -9,7 +9,7 @@ depends = base, contrib, elab-util, sop, snocvect
 modules =
  Quox.NatExtra,
-  Quox.CharExtra,
+  Quox.Unicode,
  Quox.OPE,
  Quox.Pretty,
  Quox.Syntax,
--- a/tests/Tests.idr
+++ b/tests/Tests.idr
@ -2,7 +2,7 @@ module Tests
 import Options
 import TAP
-import Tests.CharExtra
+import Tests.Unicode
 import Tests.Lexer
 import Tests.Parser
 import Tests.Equal
@ -10,7 +10,7 @@ import System
 allTests = [
-  CharExtra.tests,
+  Unicode.tests,
  Lexer.tests,
  Parser.tests,
  Equal.tests
--- a/tests/Tests/CharExtra.idr
+++ b/tests/Tests/CharExtra.idr
@ -1,56 +0,0 @@
 module Tests.CharExtra
 import Quox.NatExtra
 import Quox.CharExtra
 import Data.List
 import Data.String
 import Data.Maybe
 import TAP
 ||| The backslash escape that `show` produces for `c`, without the
 ||| surrounding quotes. `Nothing` for the single quote, for code points
 ||| above 0xFF, and for characters `show` does not render as an escape.
 escape : Char -> Maybe String
 escape '\'' = Nothing
 escape c =
  if ord c > 0xFF then Nothing else
  case unpack $ show c of
       -- `show` wraps the character in quotes: the opening quote is matched
       -- away here and the closing one is dropped by `init'`
       '\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
       _                  => Nothing
 ||| Test label for a character: the character (or its escape, when `escape`
 ||| yields one) between ｢｣ brackets, followed by its zero-padded code point
 ||| unless the code point lies in the range 0x20 to 0xFF.
 display : Char -> String
 display c =
  let ord = cast c {to = Nat} in  -- shadows the `ord` function in this body
  let c'  = fromMaybe (singleton c) $ escape c in
  if 0x20 <= ord && ord <= 0xFF then
    "｢\{c'}｣"
  else
    -- pad the `showHex` output to at least four characters
    let codepoint = padLeft 4 '0' $ showHex ord in
    "｢\{c'}｣ (U+\{codepoint})"
 ||| A single test, labelled with `display c`, that fails with `Left ()`
 ||| when `pred c` is false.
 testOne : (Char -> Bool) -> Char -> Test
 testOne pred c = test (display c) $ unless (pred c) $ Left ()
 ||| A group named `label` containing one `testOne pred` test per character
 ||| in `chars`.
 testAll : String -> (Char -> Bool) -> List Char -> Test
 testAll label pred chars = label :- map (testOne pred) chars
 -- Character-class tests: each predicate is checked against a list of
 -- characters it must accept and, negated, against a list it must reject.
 tests = "char extras" :- [
  testAll "id starts" isIdStart
    ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
  testAll "not id starts" (not . isIdStart)
    ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
  testAll "id continuations" isIdCont
    ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
  testAll "not id continuations" (not . isIdCont)
    ['_', '‿', ' ', '[', ',', '.', '\1'],
  testAll "id connectors" isIdConnector
    ['_', '‿'],
  testAll "not id connectors" (not . isIdConnector)
    ['a', ' ', ',', '-'],
  testAll "white space" isWhitespace
    [' ', '\t', '\r', '\n',
     '\x2028', -- line separator
     '\x2029'  -- paragraph separator
    ],
  testAll "not white space" (not . isWhitespace)
    ['a', '-', '_', '\1']
 ]
--- a/tests/Tests/Unicode.idr
+++ b/tests/Tests/Unicode.idr
@ -0,0 +1,90 @@
 module Tests.Unicode
 import Quox.NatExtra
 import Quox.Unicode
 import Data.List
 import Data.String
 import Data.Maybe
 import TAP
 -- Character with code point 0xFF. The display helpers in this module
 -- compare against it to decide how a character is rendered in test labels.
 maxLatin1 = '\xFF'
 ||| The backslash escape that `show` produces for `c`, without the
 ||| surrounding quotes.
 |||
 ||| Yields `Nothing` for the single quote, for any character above
 ||| `maxLatin1`, and for characters that `show` does not render as a
 ||| backslash escape.
 escape : Char -> Maybe String
 escape '\'' = Nothing
 escape c =
   case (c > maxLatin1, unpack (show c)) of
        -- `show` yields a quoted literal: the opening quote is matched away
        -- here and the closing one is dropped by `init'`
        (False, '\'' :: '\\' :: rest) => map (pack . ('\\' ::)) (init' rest)
        _                             => Nothing
 ||| The code point of `c` as rendered by `showHex`, left-padded with '0'
 ||| to at least four characters.
 codepoint : Char -> String
 codepoint c = padLeft 4 '0' (showHex (cast c))
 ||| Test label for a single character.
 |||
 ||| The character (or its escape, when `escape` yields one) is wrapped in
 ||| ｢｣ brackets. Characters outside the range '\x20' to `maxLatin1` also get
 ||| their code point appended as `(U+XXXX)`.
 display : Char -> String
 display c =
   let shown  = fromMaybe (singleton c) (escape c)
       quoted = "｢\{shown}｣"
       plain  = c >= '\x20' && c <= maxLatin1 in
   if plain then quoted else "\{quoted} (U+\{codepoint c})"
 ||| Rewrite a string so that every character above `maxLatin1` becomes a
 ||| `\x` escape followed by its `codepoint`; all other characters are kept
 ||| unchanged.
 displayS' : String -> String
 displayS' str = foldMap render (unpack str)
   where
     -- rendering of one character
     render : Char -> String
     render ch =
       if ch > maxLatin1 then "\\x\{codepoint ch}" else singleton ch
 ||| Test label for a string: the string in ｢｣ brackets. If any character is
 ||| above `maxLatin1`, the `displayS'` rendering is appended in parentheses
 ||| and double quotes.
 displayS : String -> String
 displayS str =
   let quoted = "｢\{str}｣" in
   case any (> maxLatin1) (unpack str) of
        False => quoted
        True  => "\{quoted} (\"\{displayS' str}\")"
 ||| A single test, labelled with `display` of the character, that fails
 ||| with `Left ()` when the predicate rejects the character.
 testOneChar : (Char -> Bool) -> Char -> Test
 testOneChar holds ch =
   test (display ch) $ when (not (holds ch)) (Left ())
 ||| A named group containing one `testOneChar` test per listed character,
 ||| all checked against the same predicate.
 testAllChars : String -> (Char -> Bool) -> List Char -> Test
 testAllChars title holds cs =
   let cases = map (testOneChar holds) cs in
   title :- cases
 ||| A test, labelled with `displayS` of the input, that `normalizeNfc`
 ||| maps the first string to the second. On mismatch it fails with the
 ||| expected and received strings, both rendered by `displayS`.
 testNfc : String -> String -> Test
 testNfc input expected =
   test (displayS input) $
     let actual = normalizeNfc input in
     when (actual /= expected) $
       Left [("expected", displayS expected), ("received", displayS actual)]
 ||| A test that `normalizeNfc` leaves the given string unchanged, built by
 ||| passing the string to `testNfc` as both input and expected result.
 testAlreadyNfc : String -> Test
 testAlreadyNfc str = testNfc str str
 -- Test tree for Quox.Unicode, in two groups:
 --  * "general categories": each character predicate is checked against a
 --    list of characters it must accept and, negated, a list it must reject;
 --  * "normalisation": expectations for `normalizeNfc`.
 tests = "unicode" :- [
  "general categories" :- [
    testAllChars "id starts" isIdStart
      ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '𝟙'],
    testAllChars "not id starts" (not . isIdStart)
      ['0', '_', '-', '‿', ' ', '[', ',', '.', '\1'],
    testAllChars "id continuations" isIdCont
      ['a', 'á', '𝕕', '개', 'ʨ', '𒁙', '0', '\''],
    testAllChars "not id continuations" (not . isIdCont)
      ['_', '‿', ' ', '[', ',', '.', '\1'],
    testAllChars "id connectors" isIdConnector
      ['_', '‿'],
    testAllChars "not id connectors" (not . isIdConnector)
      ['a', ' ', ',', '-'],
    testAllChars "white space" isWhitespace
      [' ', '\t', '\r', '\n',
       '\x2028', -- line separator
       '\x2029'  -- paragraph separator
      ],
    testAllChars "not white space" (not . isWhitespace)
      ['a', '-', '_', '\1']
  ],
  "normalisation" :- [
    -- "e" followed by U+0301 must normalise to the single character "é"
    testNfc        "e\x301" "é",
    testAlreadyNfc "é",
    testAlreadyNfc ""
    -- Author's note: not sure these expectations are right; if they are
    -- wrong, the fault lies with Chez's normalisation, or with Unicode.
  ]
 ]