normalizeNfc

This commit is contained in:
rhiannon morris 2022-05-12 07:41:58 +02:00
parent c743a99356
commit 274ecfb58c
5 changed files with 99 additions and 60 deletions

View file

@ -1,4 +1,4 @@
module Quox.CharExtra
module Quox.Unicode
import Generics.Derive
@ -133,3 +133,8 @@ isWhitespace : Char -> Bool
isWhitespace ch =
ch == '\t' || ch == '\r' || ch == '\n' ||
case genCat ch of Separator _ => True; _ => False
export
%foreign "scheme:string-normalize-nfc"
normalizeNfc : String -> String

View file

@ -9,7 +9,7 @@ depends = base, contrib, elab-util, sop, snocvect
modules =
Quox.NatExtra,
Quox.CharExtra,
Quox.Unicode,
Quox.OPE,
Quox.Pretty,
Quox.Syntax,

View file

@ -2,7 +2,7 @@ module Tests
import Options
import TAP
import Tests.CharExtra
import Tests.Unicode
import Tests.Lexer
import Tests.Parser
import Tests.Equal
@ -10,7 +10,7 @@ import System
allTests = [
CharExtra.tests,
Unicode.tests,
Lexer.tests,
Parser.tests,
Equal.tests

View file

@ -1,56 +0,0 @@
module Tests.CharExtra
import Quox.NatExtra
import Quox.CharExtra
import Data.List
import Data.String
import Data.Maybe
import TAP
escape : Char -> Maybe String
escape '\'' = Nothing
escape c =
if ord c > 0xFF then Nothing else
case unpack $ show c of
'\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
_ => Nothing
display : Char -> String
display c =
let ord = cast c {to = Nat} in
let c' = fromMaybe (singleton c) $ escape c in
if 0x20 <= ord && ord <= 0xFF then
"\{c'}」"
else
let codepoint = padLeft 4 '0' $ showHex ord in
"\{c'}」 (U+\{codepoint})"
testOne : (Char -> Bool) -> Char -> Test
testOne pred c = test (display c) $ unless (pred c) $ Left ()
testAll : String -> (Char -> Bool) -> List Char -> Test
testAll label pred chars = label :- map (testOne pred) chars
tests = "char extras" :- [
testAll "id starts" isIdStart
['a', 'á', '𝕕', '', 'ʨ', '𒁙', '𝟙'],
testAll "not id starts" (not . isIdStart)
['0', '_', '-', '', ' ', '[', ',', '.', '\1'],
testAll "id continuations" isIdCont
['a', 'á', '𝕕', '', 'ʨ', '𒁙', '0', '\''],
testAll "not id continuations" (not . isIdCont)
['_', '', ' ', '[', ',', '.', '\1'],
testAll "id connectors" isIdConnector
['_', ''],
testAll "not id connectors" (not . isIdConnector)
['a', ' ', ',', '-'],
testAll "white space" isWhitespace
[' ', '\t', '\r', '\n',
'\x2028', -- line separator
'\x2029' -- paragraph separator
],
testAll "not white space" (not . isWhitespace)
['a', '-', '_', '\1']
]

90
tests/Tests/Unicode.idr Normal file
View file

@ -0,0 +1,90 @@
module Tests.Unicode
import Quox.NatExtra
import Quox.Unicode
import Data.List
import Data.String
import Data.Maybe
import TAP
maxLatin1 = '\xFF'
escape : Char -> Maybe String
escape '\'' = Nothing
escape c =
if c > maxLatin1 then Nothing else
case unpack $ show c of
'\'' :: '\\' :: cs => pack . ('\\' ::) <$> init' cs
_ => Nothing
codepoint : Char -> String
codepoint = padLeft 4 '0' . showHex . cast
display : Char -> String
display c =
let c' = fromMaybe (singleton c) $ escape c in
if '\x20' <= c && c <= maxLatin1
then "\{c'}」"
else "\{c'}」 (U+\{codepoint c})"
displayS' : String -> String
displayS' =
foldMap (\c => if c <= maxLatin1 then singleton c else "\\x\{codepoint c}") .
unpack
displayS : String -> String
displayS str =
if all (<= maxLatin1) (unpack str)
then "\{str}」"
else "\{str}」 (\"\{displayS' str}\")"
testOneChar : (Char -> Bool) -> Char -> Test
testOneChar pred c = test (display c) $ unless (pred c) $ Left ()
testAllChars : String -> (Char -> Bool) -> List Char -> Test
testAllChars label pred chars = label :- map (testOneChar pred) chars
testNfc : String -> String -> Test
testNfc input result =
test (displayS input) $
let norm = normalizeNfc input in
unless (norm == result) $
Left [("expected", displayS result), ("received", displayS norm)]
testAlreadyNfc : String -> Test
testAlreadyNfc input = testNfc input input
tests = "unicode" :- [
"general categories" :- [
testAllChars "id starts" isIdStart
['a', 'á', '𝕕', '', 'ʨ', '𒁙', '𝟙'],
testAllChars "not id starts" (not . isIdStart)
['0', '_', '-', '', ' ', '[', ',', '.', '\1'],
testAllChars "id continuations" isIdCont
['a', 'á', '𝕕', '', 'ʨ', '𒁙', '0', '\''],
testAllChars "not id continuations" (not . isIdCont)
['_', '', ' ', '[', ',', '.', '\1'],
testAllChars "id connectors" isIdConnector
['_', ''],
testAllChars "not id connectors" (not . isIdConnector)
['a', ' ', ',', '-'],
testAllChars "white space" isWhitespace
[' ', '\t', '\r', '\n',
'\x2028', -- line separator
'\x2029' -- paragraph separator
],
testAllChars "not white space" (not . isWhitespace)
['a', '-', '_', '\1']
],
"normalisation" :- [
testNfc "e\x301" "é",
testAlreadyNfc "é",
testAlreadyNfc ""
-- idk if this is wrong it's chez's fault. or unicode's
]
]